From 99b3b837855b987563bcfb397cf9ddd88262814b Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sun, 4 Sep 2022 23:17:13 +0800 Subject: pstore/zone: Use GFP_ATOMIC to allocate zone buffer There is a case found when triggering a panic_on_oom, pstore fails to dump kmsg. Because psz_kmsg_write_record can't get the new buffer. Handle this by using GFP_ATOMIC to allocate a buffer at lower watermark. Signed-off-by: Qiujun Huang Fixes: 335426c6dcdd ("pstore/zone: Provide way to skip "broken" zone for MTD devices") Cc: WeiXiong Liao Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/CAJRQjofRCF7wjrYmw3D7zd5QZnwHQq+F8U-mJDJ6NZ4bddYdLA@mail.gmail.com --- fs/pstore/zone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 017d0d4ad329..2770746bb7aa 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -761,7 +761,7 @@ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, /* avoid destroying old data, allocate a new one */ len = zone->buffer_size + sizeof(*zone->buffer); zone->oldbuf = zone->buffer; - zone->buffer = kzalloc(len, GFP_KERNEL); + zone->buffer = kzalloc(len, GFP_ATOMIC); if (!zone->buffer) { zone->buffer = zone->oldbuf; return -ENOMEM; -- cgit From d85644dc5cf4023bf1a325f476d58a16bcc0798b Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 6 Oct 2022 19:42:05 -0300 Subject: pstore: Improve error reporting in case of backend overlap The pstore infrastructure supports one single backend at a time; trying to load a another backend causes an error and displays a message, introduced on commit 0d7cd09a3dbb ("pstore: Improve register_pstore() error reporting"). Happens that this message is not really clear about the situation, also the current error returned (-EPERM) isn't accurate, whereas -EBUSY makes more sense. We have another place in the code that relies in the -EBUSY return for a similar check. So, make it consistent here by returning -EBUSY and using a similar message in both scenarios. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221006224212.569555-2-gpiccoli@igalia.com --- fs/pstore/platform.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0c034ea39954..c32957e4b256 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -562,8 +562,9 @@ out: int pstore_register(struct pstore_info *psi) { if (backend && strcmp(backend, psi->name)) { - pr_warn("ignoring unexpected backend '%s'\n", psi->name); - return -EPERM; + pr_warn("backend '%s' already in use: ignoring '%s'\n", + backend, psi->name); + return -EBUSY; } /* Sanity check flags. */ -- cgit From 8f5de3fd38b7e64112017bb1630f9df6e1b3331d Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 6 Oct 2022 19:42:06 -0300 Subject: pstore: Expose kmsg_bytes as a module parameter Currently this tuning is only exposed as a filesystem option, but most Linux distros automatically mount pstore, hence changing this setting requires remounting it. Also, if that mount option wasn't explicitly set it doesn't show up in mount information, so users cannot check what is the current value of kmsg_bytes. Let's then expose it as a module parameter, allowing both user visibility at all times (even if not manually set) and also the possibility of setting that as a boot/module parameter. Signed-off-by: Guilherme G. 
Piccoli Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221006224212.569555-3-gpiccoli@igalia.com --- fs/pstore/platform.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c32957e4b256..be05090076ce 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -89,6 +89,11 @@ static char *compress = module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); +/* How much of the kernel log to snapshot */ +unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, ulong, 0444); +MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); + /* Compression parameters */ static struct crypto_comp *tfm; @@ -100,9 +105,6 @@ struct pstore_zbackend { static char *big_oops_buf; static size_t big_oops_buf_sz; -/* How much of the console log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; - void pstore_set_kmsg_bytes(int bytes) { kmsg_bytes = bytes; -- cgit From 6a14f1982e6b476c84938c9ca011d258a6bba24d Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 6 Oct 2022 19:42:07 -0300 Subject: pstore: Inform unregistered backend names as well Currently we only show the registered ones in the kernel log; users that change backend for some reason require first to unregister the current one, so let's confirm this operation with a message in the log. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221006224212.569555-4-gpiccoli@igalia.com --- fs/pstore/platform.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index be05090076ce..06c2c66af332 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -665,6 +665,8 @@ void pstore_unregister(struct pstore_info *psi) psinfo = NULL; kfree(backend); backend = NULL; + + pr_info("Unregistered %s as persistent store backend\n", psi->name); mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); -- cgit From 3219122b8cdd580f0793e803d4e80ecd7384cf17 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:08 -0700 Subject: pstore/ram: Consolidate kfree() paths There's no reason to keep separate kfree() paths: either all allocations succeeded, or not. Everything is torn down in the case of failure, so adjust the callers to reflect this. Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed-and-tested-by: Guilherme G. 
Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-2-keescook@chromium.org --- fs/pstore/ram.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index fefe3d391d3a..348820a3ce88 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -451,6 +451,12 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; + /* Free pmsg PRZ */ + persistent_ram_free(cxt->mprz); + + /* Free console PRZ */ + persistent_ram_free(cxt->cprz); + /* Free dump PRZs */ if (cxt->dprzs) { for (i = 0; i < cxt->max_dump_cnt; i++) @@ -772,12 +778,12 @@ static int ramoops_probe(struct platform_device *pdev) dump_mem_sz, cxt->record_size, &cxt->max_dump_cnt, 0, 0); if (err) - goto fail_out; + goto fail_init; err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr, cxt->console_size, 0); if (err) - goto fail_init_cprz; + goto fail_init; cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? nr_cpu_ids @@ -788,12 +794,12 @@ static int ramoops_probe(struct platform_device *pdev) (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? PRZ_FLAG_NO_LOCK : 0); if (err) - goto fail_init_fprz; + goto fail_init; err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); if (err) - goto fail_init_mprz; + goto fail_init; cxt->pstore.data = cxt; /* @@ -857,11 +863,7 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); -fail_init_mprz: -fail_init_fprz: - persistent_ram_free(cxt->cprz); -fail_init_cprz: +fail_init: ramoops_free_przs(cxt); fail_out: return err; @@ -876,8 +878,6 @@ static int ramoops_remove(struct platform_device *pdev) kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); - persistent_ram_free(cxt->cprz); ramoops_free_przs(cxt); return 0; -- cgit From 6daf4e82bd54ef004ace4dd6deed60a32c282a95 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:09 -0700 Subject: pstore/ram: Move pmsg init earlier Since the ftrace area can vary in size based on CPU count, move pmsg initialization earlier so it will have a stable location. Suggested-by: Paramjit Oberoi Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed: Guilherme G. Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-3-keescook@chromium.org --- fs/pstore/ram.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 348820a3ce88..2f18563c8141 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -785,6 +785,11 @@ static int ramoops_probe(struct platform_device *pdev) if (err) goto fail_init; + err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, + cxt->pmsg_size, 0); + if (err) + goto fail_init; + cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? nr_cpu_ids : 1; @@ -796,11 +801,6 @@ static int ramoops_probe(struct platform_device *pdev) if (err) goto fail_init; - err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, - cxt->pmsg_size, 0); - if (err) - goto fail_init; - cxt->pstore.data = cxt; /* * Prepare frontend flags based on which areas are initialized. -- cgit From 8bd4da0f0626ae9a82099d3d99cd6efd4355879b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:10 -0700 Subject: pstore/ram: Move internal definitions out of kernel-wide include Most of the details of the ram backend are entirely internal to the backend itself. 
Leave only what is needed to instantiate a ram backend in the kernel-wide header. Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed-and-tested-by: Guilherme G. Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-4-keescook@chromium.org --- fs/pstore/ram.c | 3 +- fs/pstore/ram_core.c | 3 +- fs/pstore/ram_internal.h | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 fs/pstore/ram_internal.h (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2f18563c8141..f5bf360cf905 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -18,10 +18,11 @@ #include #include #include -#include #include #include + #include "internal.h" +#include "ram_internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index a89e33719fcf..9e1047f4316d 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -13,13 +13,14 @@ #include #include #include -#include #include #include #include #include #include +#include "ram_internal.h" + /** * struct persistent_ram_buffer - persistent circular RAM buffer * diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h new file mode 100644 index 000000000000..440ee7a35e10 --- /dev/null +++ b/fs/pstore/ram_internal.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2010 Marco Stornelli + * Copyright (C) 2011 Kees Cook + * Copyright (C) 2011 Google, Inc. + */ + +#include + +/* + * Choose whether access to the RAM zone requires locking or not. If a zone + * can be written to from different CPUs like with ftrace for example, then + * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required. + */ +#define PRZ_FLAG_NO_LOCK BIT(0) +/* + * If a PRZ should only have a single-boot lifetime, this marks it as + * getting wiped after its contents get copied out after boot. + */ +#define PRZ_FLAG_ZAP_OLD BIT(1) + +/** + * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ) + * used as a pstore backend + * + * @paddr: physical address of the mapped RAM area + * @size: size of mapping + * @label: unique name of this PRZ + * @type: frontend type for this PRZ + * @flags: holds PRZ_FLAGS_* bits + * + * @buffer_lock: + * locks access to @buffer "size" bytes and "start" offset + * @buffer: + * pointer to actual RAM area managed by this PRZ + * @buffer_size: + * bytes in @buffer->data (not including any trailing ECC bytes) + * + * @par_buffer: + * pointer into @buffer->data containing ECC bytes for @buffer->data + * @par_header: + * pointer into @buffer->data containing ECC bytes for @buffer header + * (i.e. 
all fields up to @data) + * @rs_decoder: + * RSLIB instance for doing ECC calculations + * @corrected_bytes: + * ECC corrected bytes accounting since boot + * @bad_blocks: + * ECC uncorrectable bytes accounting since boot + * @ecc_info: + * ECC configuration details + * + * @old_log: + * saved copy of @buffer->data prior to most recent wipe + * @old_log_size: + * bytes contained in @old_log + * + */ +struct persistent_ram_zone { + phys_addr_t paddr; + size_t size; + void *vaddr; + char *label; + enum pstore_type_id type; + u32 flags; + + raw_spinlock_t buffer_lock; + struct persistent_ram_buffer *buffer; + size_t buffer_size; + + char *par_buffer; + char *par_header; + struct rs_control *rs_decoder; + int corrected_bytes; + int bad_blocks; + struct persistent_ram_ecc_info ecc_info; + + char *old_log; + size_t old_log_size; +}; + +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, struct persistent_ram_ecc_info *ecc_info, + unsigned int memtype, u32 flags, char *label); +void persistent_ram_free(struct persistent_ram_zone *prz); +void persistent_ram_zap(struct persistent_ram_zone *prz); + +int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); + +void persistent_ram_save_old(struct persistent_ram_zone *prz); +size_t persistent_ram_old_size(struct persistent_ram_zone *prz); +void *persistent_ram_old(struct persistent_ram_zone *prz); +void persistent_ram_free_old(struct persistent_ram_zone *prz); +ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, + char *str, size_t len); -- cgit From 11c2a8700cdcabf9b639b7204a1e38e2a0b6798e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:34 +0200 Subject: attr: add in_group_or_capable() In setattr_{copy,prepare}() we need to perform the same permission checks to determine whether we need to drop the setgid bit or not. Instead of open-coding it twice add a simple helper the encapsulates the logic. We will reuse this helpers to make dropping the setgid bit during write operations more consistent in a follow up patch. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 10 +++++----- fs/inode.c | 28 ++++++++++++++++++++++++---- fs/internal.h | 2 ++ 3 files changed, 31 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 1552a5f23d6b..b1162fca84a2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,6 +18,8 @@ #include #include +#include "internal.h" + /** * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from @@ -140,8 +142,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! 
*/ - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } @@ -251,9 +252,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/fs/inode.c b/fs/inode.c index b608528efd3a..55299b710c45 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2487,6 +2487,28 @@ struct timespec64 current_time(struct inode *inode) } EXPORT_SYMBOL(current_time); +/** + * in_group_or_capable - check whether caller is CAP_FSETID privileged + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * @vfsgid: the new/current vfsgid of @inode + * + * Check wether @vfsgid is in the caller's group list or if the caller is + * privileged with CAP_FSETID over @inode. This can be used to determine + * whether the setgid bit can be kept or must be dropped. + * + * Return: true if the caller is sufficiently privileged, false if not. + */ +bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid) +{ + if (vfsgid_in_group_p(vfsgid)) + return true; + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + return true; + return false; +} + /** * mode_strip_sgid - handle the sgid bit for non-directories * @mnt_userns: User namespace of the mount the inode was created from @@ -2508,11 +2530,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns, return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; - if (in_group_p(i_gid_into_mnt(mnt_userns, dir))) - return mode; - if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) + if (in_group_or_capable(mnt_userns, dir, + i_gid_into_vfsgid(mnt_userns, dir))) return mode; - return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); diff --git a/fs/internal.h b/fs/internal.h index 6f0386b34fae..1de39bbc9ddd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -151,6 +151,8 @@ extern int vfs_open(const struct path *, struct file *); */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern int dentry_needs_remove_privs(struct dentry *dentry); +bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c -- cgit From e243e3f94c804ecca9a8241b5babe28f35258ef4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:35 +0200 Subject: fs: move should_remove_suid() Move the helper from inode.c to attr.c. This keeps the the core of the set{g,u}id stripping logic in one place when we add follow-up changes. It is the better place anyway, since should_remove_suid() returns ATTR_KILL_S{G,U}ID flags. 
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 29 +++++++++++++++++++++++++++++ fs/inode.c | 29 ----------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index b1162fca84a2..e508b3caae76 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -20,6 +20,35 @@ #include "internal.h" +/* + * The logic we want is + * + * if suid or (sgid and xgrp) + * remove privs + */ +int should_remove_suid(struct dentry *dentry) +{ + umode_t mode = d_inode(dentry)->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + /** * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from diff --git a/fs/inode.c b/fs/inode.c index 55299b710c45..6df2b7c936c2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1948,35 +1948,6 @@ skip_update: } EXPORT_SYMBOL(touch_atime); -/* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = d_inode(dentry)->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. -- cgit From 72ae017c5451860443a16fb2a8c243bff3e396b8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:36 +0200 Subject: attr: add setattr_should_drop_sgid() The current setgid stripping logic during write and ownership change operations is inconsistent and strewn over multiple places. In order to consolidate it and make more consistent we'll add a new helper setattr_should_drop_sgid(). The function retains the old behavior where we remove the S_ISGID bit unconditionally when S_IXGRP is set but also when it isn't set and the caller is neither in the group of the inode nor privileged over the inode. We will use this helper both in write operation permission removal such as file_remove_privs() as well as in ownership change operations. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 28 ++++++++++++++++++++++++++++ fs/internal.h | 6 ++++++ 2 files changed, 34 insertions(+) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index e508b3caae76..085322536127 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -20,6 +20,34 @@ #include "internal.h" +/** + * setattr_should_drop_sgid - determine whether the setgid bit needs to be + * removed + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * + * This function determines whether the setgid bit needs to be removed. 
+ * We retain backwards compatibility and require setgid bit to be removed + * unconditionally if S_IXGRP is set. Otherwise we have the exact same + * requirements as setattr_prepare() and setattr_copy(). + * + * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode) +{ + umode_t mode = inode->i_mode; + + if (!(mode & S_ISGID)) + return 0; + if (mode & S_IXGRP) + return ATTR_KILL_SGID; + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) + return ATTR_KILL_SGID; + return 0; +} + /* * The logic we want is * diff --git a/fs/internal.h b/fs/internal.h index 1de39bbc9ddd..771b0468d70c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -236,3 +236,9 @@ int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); + +/* + * fs/attr.c + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode); -- cgit From ed5a7047d2011cb6b2bf84ceb6680124cc6a7d95 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 17 Oct 2022 17:06:37 +0200 Subject: attr: use consistent sgid stripping checks Currently setgid stripping in file_remove_privs()'s should_remove_suid() helper is inconsistent with other parts of the vfs. Specifically, it only raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the inode isn't in the caller's groups and the caller isn't privileged over the inode although we require this already in setattr_prepare() and setattr_copy() and so all filesystem implement this requirement implicitly because they have to use setattr_{prepare,copy}() anyway. But the inconsistency shows up in setgid stripping bugs for overlayfs in xfstests (e.g., generic/673, generic/683, generic/685, generic/686, generic/687). For example, we test whether suid and setgid stripping works correctly when performing various write-like operations as an unprivileged user (fallocate, reflink, write, etc.): echo "Test 1 - qa_user, non-exec file $verb" setup_testfile chmod a+rws $junk_file commit_and_check "$qa_user" "$verb" 64k 64k The test basically creates a file with 6666 permissions. While the file has the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a regular filesystem like xfs what will happen is: sys_fallocate() -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> setattr_copy() In should_remove_suid() we can see that ATTR_KILL_SUID is raised unconditionally because the file in the test has S_ISUID set. But we also see that ATTR_KILL_SGID won't be set because while the file is S_ISGID it is not S_IXGRP (see above) which is a condition for ATTR_KILL_SGID being raised. So by the time we call notify_change() we have attr->ia_valid set to ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that ATTR_KILL_SUID is set and does: ia_valid = attr->ia_valid |= ATTR_MODE attr->ia_mode = (inode->i_mode & ~S_ISUID); which means that when we call setattr_copy() later we will definitely update inode->i_mode. Note that attr->ia_mode still contains S_ISGID. Now we call into the filesystem's ->setattr() inode operation which will end up calling setattr_copy(). 
Since ATTR_MODE is set we will hit: if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } and since the caller in the test is neither capable nor in the group of the inode the S_ISGID bit is stripped. But assume the file isn't suid then ATTR_KILL_SUID won't be raised which has the consequence that neither the setgid nor the suid bits are stripped even though it should be stripped because the inode isn't in the caller's groups and the caller isn't privileged over the inode. If overlayfs is in the mix things become a bit more complicated and the bug shows up more clearly. When e.g., ovl_setattr() is hit from ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and ATTR_KILL_SGID might be raised but because the check in notify_change() is questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be stripped the S_ISGID bit isn't removed even though it should be stripped: sys_fallocate() -> vfs_fallocate() -> ovl_fallocate() -> file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = ATTR_FORCE | kill; -> notify_change() -> ovl_setattr() // TAKE ON MOUNTER'S CREDS -> ovl_do_notify_change() -> notify_change() // GIVE UP MOUNTER'S CREDS // TAKE ON MOUNTER'S CREDS -> vfs_fallocate() -> xfs_file_fallocate() -> file_modified() -> __file_remove_privs() -> dentry_needs_remove_privs() -> should_remove_suid() -> __remove_privs() newattrs.ia_valid = attr_force | kill; -> notify_change() The fix for all of this is to make file_remove_privs()'s should_remove_suid() helper to perform the same checks as we already require in setattr_prepare() and setattr_copy() and have notify_change() not pointlessly requiring S_IXGRP again. It doesn't make any sense in the first place because the caller must calculate the flags via should_remove_suid() anyway which would raise ATTR_KILL_SGID. While we're at it we move should_remove_suid() from inode.c to attr.c where it belongs with the rest of the iattr helpers. Especially since it returns ATTR_KILL_S{G,U}ID flags. We also rename it to setattr_should_drop_suidgid() to better reflect that it indicates both setuid and setgid bit removal and also that it returns attr flags. Running xfstests with this doesn't report any regressions. We should really try and use consistent checks. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/attr.c | 33 +++++++++++++++++++-------------- fs/fuse/file.c | 2 +- fs/inode.c | 7 ++++--- fs/internal.h | 2 +- fs/ocfs2/file.c | 4 ++-- fs/open.c | 8 ++++---- 6 files changed, 31 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 085322536127..b45f30e516fa 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -48,34 +48,39 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, return 0; } -/* - * The logic we want is +/** + * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to + * be dropped + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check * - * if suid or (sgid and xgrp) - * remove privs + * This function determines whether the set{g,u}id bits need to be removed. + * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the + * setgid bit needs to be removed ATTR_KILL_SGID is returned. 
If both + * set{g,u}id bits need to be removed the corresponding mask of both flags is + * returned. + * + * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits + * to remove, 0 otherwise. */ -int should_remove_suid(struct dentry *dentry) +int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, + struct inode *inode) { - umode_t mode = d_inode(dentry)->i_mode; + umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; + kill |= setattr_should_drop_sgid(mnt_userns, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } -EXPORT_SYMBOL(should_remove_suid); +EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode @@ -432,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, } } if (ia_valid & ATTR_KILL_SGID) { - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1a3afd469e3a..97e2d815075d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - should_remove_suid(file_dentry(file))) { + setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { goto writethrough; } diff --git a/fs/inode.c b/fs/inode.c index 6df2b7c936c2..8c4078889754 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1953,7 +1953,8 @@ EXPORT_SYMBOL(touch_atime); * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). 
*/ -int dentry_needs_remove_privs(struct dentry *dentry) +int dentry_needs_remove_privs(struct user_namespace *mnt_userns, + struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; @@ -1962,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) if (IS_NOSEC(inode)) return 0; - mask = should_remove_suid(dentry); + mask = setattr_should_drop_suidgid(mnt_userns, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -1994,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; - kill = dentry_needs_remove_privs(dentry); + kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry); if (kill < 0) return kill; diff --git a/fs/internal.h b/fs/internal.h index 771b0468d70c..5545c26d86ae 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -150,7 +150,7 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -extern int dentry_needs_remove_privs(struct dentry *dentry); +int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry); bool in_group_or_capable(struct user_namespace *mnt_userns, const struct inode *inode, vfsgid_t vfsgid); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9c67edd215d5..4d78e0979517 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && should_remove_suid(file->f_path.dentry)) { + if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ - if (should_remove_suid(dentry)) { + if (setattr_should_drop_suidgid(&init_user_ns, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, diff --git a/fs/open.c b/fs/open.c index a81319b6177f..9d0197db15e7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(dentry); + ret = dentry_needs_remove_privs(mnt_userns, dentry); if (ret < 0) return ret; if (ret) @@ -723,10 +723,10 @@ retry_deleg: return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; - if (!S_ISDIR(inode->i_mode)) - newattrs.ia_valid |= - ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | + setattr_should_drop_sgid(mnt_userns, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, -- cgit From b306e90ffabdaa7e3b3350dbcd19b7663e71ab17 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 17 Oct 2022 17:06:38 +0200 Subject: ovl: remove privs in ovl_copyfile() Underlying fs doesn't remove privs because copy_range/remap_range are called with privileged mounter credentials. This fixes some failures in fstest generic/673. 
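To make the ordering concrete, here is a condensed sketch of the flow the patch below establishes (simplified; ovl_do_copyfile() is a hypothetical stand-in for the per-op dispatch): privileges must be stripped while still running with the caller's credentials, because once ovl_override_creds() switches to the mounter's credentials the underlying filesystem sees CAP_FSETID and skips the stripping itself.

	inode_lock(inode_out);
	if (op != OVL_DEDUPE) {
		/*
		 * Refresh the mode from the real inode, then drop
		 * suid/sgid/caps with the caller's (unprivileged) creds.
		 */
		ovl_copyattr(inode_out);
		ret = file_remove_privs(file_out);
		if (ret)
			goto out_unlock;
	}
	old_cred = ovl_override_creds(file_inode(file_out)->i_sb); /* privileged */
	ret = ovl_do_copyfile(&real_in, pos_in, &real_out, pos_out, len, op);
	revert_creds(old_cred);
out_unlock:
	inode_unlock(inode_out);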
Fixes: 8ede205541ff ("ovl: add reflink/copyfile/dedup support") Acked-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/file.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a1a22f58ba18..755a11c63596 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -567,14 +567,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, const struct cred *old_cred; loff_t ret; + inode_lock(inode_out); + if (op != OVL_DEDUPE) { + /* Update mode */ + ovl_copyattr(inode_out); + ret = file_remove_privs(file_out); + if (ret) + goto out_unlock; + } + ret = ovl_real_fdget(file_out, &real_out); if (ret) - return ret; + goto out_unlock; ret = ovl_real_fdget(file_in, &real_in); if (ret) { fdput(real_out); - return ret; + goto out_unlock; } old_cred = ovl_override_creds(file_inode(file_out)->i_sb); @@ -603,6 +612,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, fdput(real_in); fdput(real_out); +out_unlock: + inode_unlock(inode_out); + return ret; } -- cgit From 23a8ce16419a3066829ad4a8b7032a75817af65b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 17 Oct 2022 17:06:39 +0200 Subject: ovl: remove privs in ovl_fallocate() Underlying fs doesn't remove privs because fallocate is called with privileged mounter credentials. This fixes some failure in fstests generic/683..687. Fixes: aab8848cee5e ("ovl: add ovl_fallocate()") Acked-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 755a11c63596..d066be3b9226 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -517,9 +517,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len const struct cred *old_cred; int ret; + inode_lock(inode); + /* Update mode */ + ovl_copyattr(inode); + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = ovl_real_fdget(file, &real); if (ret) - return ret; + goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fallocate(real.file, mode, offset, len); @@ -530,6 +537,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len fdput(real); +out_unlock: + inode_unlock(inode); + return ret; } -- cgit From 898f706695682b9954f280d95e49fa86ffa55d08 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 18 Oct 2022 08:48:07 -0500 Subject: fs: jfs: fix shift-out-of-bounds in dbAllocAG Syzbot found a crash : UBSAN: shift-out-of-bounds in dbAllocAG. The underlying bug is the missing check of bmp->db_agl2size. The field can be greater than 64 and trigger the shift-out-of-bounds. Fix this bug by adding a check of bmp->db_agl2size in dbMount since this field is used in many following functions. The upper bound for this field is L2MAXL2SIZE - L2MAXAG, thanks for the help of Dave Kleikamp. Note that, for maintenance, I reorganized error handling code of dbMount. 
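For context on the crash class: left-shifting a 64-bit value by 64 or more is undefined behaviour in C, which is exactly what UBSAN reports when the unchecked on-disk field is used. A minimal illustration (not actual jfs code):

	s64 blkno = 1;
	int db_agl2size = 65;	/* corrupted value read straight from disk */
	blkno <<= db_agl2size;	/* UB: shift count >= width of s64; UBSAN trips here */

The check added below rejects such a value at mount time, bounding it by L2MAXL2SIZE - L2MAXAG before any shift can be reached.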
Reported-by: syzbot+15342c1aa6a00fb7a438@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 6b838d3ae7c2..e1cbfbb60303 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap) struct bmap *bmp; struct dbmap_disk *dbmp_le; struct metapage *mp; - int i; + int i, err; /* * allocate/initialize the in-memory bmap descriptor @@ -170,8 +170,8 @@ int dbMount(struct inode *ipbmap) BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, PSIZE, 0); if (mp == NULL) { - kfree(bmp); - return -EIO; + err = -EIO; + goto err_kfree_bmp; } /* copy the on-disk bmap descriptor to its in-memory version. */ @@ -181,9 +181,8 @@ int dbMount(struct inode *ipbmap) bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); if (!bmp->db_numag) { - release_metapage(mp); - kfree(bmp); - return -EINVAL; + err = -EINVAL; + goto err_release_metapage; } bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); @@ -194,6 +193,11 @@ int dbMount(struct inode *ipbmap) bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + for (i = 0; i < MAXAG; i++) bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); @@ -214,6 +218,12 @@ int dbMount(struct inode *ipbmap) BMAP_LOCK_INIT(bmp); return (0); + +err_release_metapage: + release_metapage(mp); +err_kfree_bmp: + kfree(bmp); + return err; } -- cgit From 73c6da327ff17a07a9260b0ff1f36e13665ac63d Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Thu, 14 Jul 2022 10:56:56 +0800 Subject: fs/jfs: replace ternary operator with min_t() Fix the following coccicheck warning: fs/jfs/super.c:748: WARNING opportunity for min(). fs/jfs/super.c:788: WARNING opportunity for min(). min_t() macro is defined in include/linux/minmax.h. It avoids multiple evaluations of the arguments when non-constant and performs strict type-checking. Signed-off-by: Jiangshan Yi Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85d4f44f2ac4..d2f82cb7db1b 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -745,8 +745,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); @@ -785,8 +784,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, inode_lock(inode); while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); -- cgit From b0a35efa0ebc50032ab81d60ed5a12de4324c5e2 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Fri, 2 Sep 2022 16:53:38 +0800 Subject: fs/jfs/jfs_xattr.h: Fix spelling typo in comment Fix spelling typo in comment. 
Reported-by: k2ci Signed-off-by: Jiangshan Yi Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index c50167a7bc50..0d33816d251d 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -25,7 +25,7 @@ struct jfs_ea_list { struct jfs_ea ea[]; /* Variable length list */ }; -/* Macros for defining maxiumum number of bytes supported for EAs */ +/* Macros for defining maximum number of bytes supported for EAs */ #define MAXEASIZE 65535 #define MAXEALISTSIZE MAXEASIZE -- cgit From 1ea66d71b17608b49ddb907f6155bc47c6e33506 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 9 Sep 2022 14:50:55 +0800 Subject: jfs: remove unused declarations for jfs extRealloc(), xtRelocate(), xtDelete() and extFill() have been removed since commit e471e5942c00 ("fs/jfs: Remove dead code"), so remove them. Signed-off-by: Gaosheng Cui Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_extent.h | 2 -- fs/jfs/jfs_xtree.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h index 1c984214e95e..a0ee4ccea66e 100644 --- a/fs/jfs/jfs_extent.h +++ b/fs/jfs/jfs_extent.h @@ -10,9 +10,7 @@ (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); -extern int extFill(struct inode *, xad_t *); extern int extHint(struct inode *, s64, xad_t *); -extern int extRealloc(struct inode *, s64, xad_t *, bool); extern int extRecord(struct inode *, xad_t *); #endif /* _H_JFS_EXTENT */ diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 142caafc73b1..ad7592191d76 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -96,12 +96,8 @@ extern int xtInsert(tid_t tid, struct inode *ip, extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); -extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, - int flag); extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); -extern int xtRelocate(tid_t tid, struct inode *ip, - xad_t * oxad, s64 nxaddr, int xtype); extern int xtAppend(tid_t tid, struct inode *ip, int xflag, s64 xoff, int maxblocks, int *xlenp, s64 * xaddrp, int flag); -- cgit From dee8744524096514109a9680a9c45a4259dae2a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 18 Oct 2022 16:23:10 +0100 Subject: jfs: remove redundant assignments to ipaimap and ipaimap2 The pointers ipaimap and ipaimap2 are re-assigned with values a second time with the same values when they were initialized. The re-assignments are redundant and can be removed. 
Cleans up two clang scan build warnings: fs/jfs/jfs_umount.c:42:16: warning: Value stored to 'ipaimap' during its initialization is never read [deadcode.DeadStores] fs/jfs/jfs_umount.c:43:16: warning: Value stored to 'ipaimap2' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_umount.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 3e8b13e6aa01..95ebcd17ce75 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -68,7 +68,6 @@ int jfs_umount(struct super_block *sb) /* * close secondary aggregate inode allocation map */ - ipaimap2 = sbi->ipaimap2; if (ipaimap2) { diUnmount(ipaimap2, 0); diFreeSpecial(ipaimap2); @@ -78,7 +77,6 @@ int jfs_umount(struct super_block *sb) /* * close aggregate inode allocation map */ - ipaimap = sbi->ipaimap; diUnmount(ipaimap, 0); diFreeSpecial(ipaimap); sbi->ipaimap = NULL; -- cgit From 3350607dc5637be2563f484dcfe2fed456f3d4ff Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Tue, 18 Oct 2022 20:22:06 +0200 Subject: security: Create file_truncate hook from path_truncate hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like path_truncate, the file_truncate hook also restricts file truncation, but is called in the cases where truncation is attempted on an already-opened file. This is required in a subsequent commit to handle ftruncate() operations differently to truncate() operations. Acked-by: Tetsuo Handa Acked-by: John Johansen Acked-by: Paul Moore Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20221018182216.301684-2-gnoack3000@gmail.com Signed-off-by: Mickaël Salaün --- fs/namei.c | 2 +- fs/open.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..85e9b9cc5c38 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3211,7 +3211,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) if (error) return error; - error = security_path_truncate(path); + error = security_file_truncate(filp); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, diff --git a/fs/open.c b/fs/open.c index a81319b6177f..c92f76ab341a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -188,7 +188,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); - error = security_path_truncate(&f.file->f_path); + error = security_file_truncate(f.file); if (!error) error = do_truncate(file_mnt_user_ns(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); -- cgit From 4053d2500beb0ca1e0757665af9e31da249a7a52 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Sep 2022 10:19:34 +0200 Subject: orangefs: rework posix acl handling when creating new filesystem objects When creating new filesytem objects orangefs used to create posix acls after it had created and inserted a new inode. This made it necessary to all posix_acl_chmod() on the newly created inode in case the mode of the inode would be changed by the posix acls. Instead of doing it this way calculate the correct mode directly before actually creating the inode. So we first create posix acls, then pass the mode that posix acls mandate into the orangefs getattr helper and calculate the correct mode. 
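Condensed from the patch below, the creation path now computes the final mode up front and applies the ACLs directly (error paths trimmed; posix_acl_create(), __orangefs_inode_getattr() and __orangefs_set_acl() are the functions the patch actually uses):

	error = posix_acl_create(dir, &mode, &default_acl, &acl);
	if (error)
		return ERR_PTR(error);

	/* mode already carries any ACL-mandated adjustment */
	error = __orangefs_inode_getattr(inode, mode, ORANGEFS_GETATTR_NEW);

	if (!error && default_acl)
		error = __orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
	if (!error && acl)
		error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);

	posix_acl_release(acl);		/* both releases tolerate NULL */
	posix_acl_release(default_acl);

so the forcible __orangefs_setattr() that previously corrected the mode after inode creation is no longer needed.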
This is needed so we can simply change posix_acl_chmod() to take a dentry instead of an inode argument in the next patch. Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- fs/orangefs/acl.c | 44 ++----------------------------------------- fs/orangefs/inode.c | 44 +++++++++++++++++++++++++++++++++++-------- fs/orangefs/orangefs-kernel.h | 6 ++++-- fs/orangefs/orangefs-utils.c | 10 ++++++++-- 4 files changed, 50 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 605e5a3506ec..0e2db840c217 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -64,8 +64,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu) return acl; } -static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, - int type) +int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error = 0; void *value = NULL; @@ -153,46 +152,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, rc = __orangefs_set_acl(inode, acl, type); if (!rc && (iattr.ia_valid == ATTR_MODE)) - rc = __orangefs_setattr(inode, &iattr); + rc = __orangefs_setattr_mode(inode, &iattr); return rc; } - -int orangefs_init_acl(struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - umode_t mode = inode->i_mode; - struct iattr iattr; - int error = 0; - - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) - return error; - - if (default_acl) { - error = __orangefs_set_acl(inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } else { - inode->i_default_acl = NULL; - } - - if (acl) { - if (!error) - error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); - } else { - inode->i_acl = NULL; - } - - /* If mode of the inode was changed, then do a forcible ->setattr */ - if (mode != inode->i_mode) { - memset(&iattr, 0, sizeof iattr); - inode->i_mode = mode; - iattr.ia_mode = mode; - iattr.ia_valid |= ATTR_MODE; - __orangefs_setattr(inode, &iattr); - } - - return error; -} diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698d..35788cde6d24 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -828,15 +828,22 @@ again: spin_unlock(&inode->i_lock); mark_inode_dirty(inode); - if (iattr->ia_valid & ATTR_MODE) - /* change mod on a file that has ACLs */ - ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); - ret = 0; out: return ret; } +int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr) +{ + int ret; + + ret = __orangefs_setattr(inode, iattr); + /* change mode on a file that has ACLs */ + if (!ret && (iattr->ia_valid & ATTR_MODE)) + ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + return ret; +} + /* * Change attributes of an object referenced by dentry. */ @@ -849,7 +856,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) goto out; - ret = __orangefs_setattr(d_inode(dentry), iattr); + ret = __orangefs_setattr_mode(d_inode(dentry), iattr); sync_inode_metadata(d_inode(dentry), 1); out: gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", @@ -1097,8 +1104,9 @@ struct inode *orangefs_iget(struct super_block *sb, * Allocate an inode for a newly created file and insert it into the inode hash. 
*/ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, - int mode, dev_t dev, struct orangefs_object_kref *ref) + umode_t mode, dev_t dev, struct orangefs_object_kref *ref) { + struct posix_acl *acl = NULL, *default_acl = NULL; unsigned long hash = orangefs_handle_hash(ref); struct inode *inode; int error; @@ -1115,16 +1123,33 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, if (!inode) return ERR_PTR(-ENOMEM); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto out_iput; + orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); + error = __orangefs_inode_getattr(inode, mode, ORANGEFS_GETATTR_NEW); if (error) goto out_iput; orangefs_init_iops(inode); inode->i_rdev = dev; + if (default_acl) { + error = __orangefs_set_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + if (error) + goto out_iput; + } + + if (acl) { + error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_iput; + } + error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref); if (error < 0) goto out_iput; @@ -1132,10 +1157,13 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, gossip_debug(GOSSIP_INODE_DEBUG, "Initializing ACL's for inode %pU\n", get_khandle_from_ino(inode)); - orangefs_init_acl(inode, dir); + posix_acl_release(acl); + posix_acl_release(default_acl); return inode; out_iput: iput(inode); + posix_acl_release(acl); + posix_acl_release(default_acl); return ERR_PTR(error); } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index b5940ec1836a..3298b15684b7 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -103,13 +103,13 @@ enum orangefs_vfs_op_states { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif -extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); +int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* * orangefs data structures @@ -356,11 +356,12 @@ void fsid_key_table_finalize(void); vm_fault_t orangefs_page_mkwrite(struct vm_fault *); struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, - int mode, + umode_t mode, dev_t dev, struct orangefs_object_kref *ref); int __orangefs_setattr(struct inode *, struct iattr *); +int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr); int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, @@ -422,6 +423,7 @@ int orangefs_inode_setxattr(struct inode *inode, #define ORANGEFS_GETATTR_SIZE 2 int orangefs_inode_getattr(struct inode *, int); +int __orangefs_inode_getattr(struct inode *inode, umode_t mode, int flags); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 46b7dcff18ac..334a2fd98c37 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -233,7 +233,7 @@ static int orangefs_inode_is_stale(struct inode *inode, return 0; } -int orangefs_inode_getattr(struct inode *inode, int flags) +int __orangefs_inode_getattr(struct inode *inode, umode_t mode, 
int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -369,7 +369,8 @@ again2: /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | - orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); + orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes) | + mode; orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; @@ -381,6 +382,11 @@ out: return ret; } +int orangefs_inode_getattr(struct inode *inode, int flags) +{ + return __orangefs_inode_getattr(inode, 0, flags); +} + int orangefs_inode_check_changed(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); -- cgit From 138060ba92b3b0d77c8e6818d0f33398b23ea42e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Sep 2022 10:29:39 +0200 Subject: fs: pass dentry to set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. Since some filesystem rely on the dentry being available to them when setting posix acls (e.g., 9p and cifs) they cannot rely on set acl inode operation. But since ->set_acl() is required in order to use the generic posix acl xattr handlers filesystems that do not implement this inode operation cannot use the handler and need to implement their own dedicated posix acl handlers. Update the ->set_acl() inode method to take a dentry argument. This allows all filesystems to rely on ->set_acl(). As far as I can tell all codepaths can be switched to rely on the dentry instead of just the inode. Note that the original motivation for passing the dentry separate from the inode instead of just the dentry in the xattr handlers was because of security modules that call security_d_instantiate(). This hook is called during d_instantiate_new(), d_add(), __d_instantiate_anon(), and d_splice_alias() to initialize the inode's security context and possibly to set security.* xattrs. Since this only affects security.* xattrs this is completely irrelevant for posix acls. 
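The shape of the API change, visible in each conversion below, is just the second parameter of the ->set_acl() inode operation:

	/* before */
	int (*set_acl)(struct user_namespace *mnt_userns, struct inode *inode,
		       struct posix_acl *acl, int type);

	/* after */
	int (*set_acl)(struct user_namespace *mnt_userns, struct dentry *dentry,
		       struct posix_acl *acl, int type);

Implementations that still need the inode derive it locally with struct inode *inode = d_inode(dentry);, as the btrfs and ceph hunks below do.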
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- fs/bad_inode.c | 2 +- fs/btrfs/acl.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/acl.c | 3 ++- fs/ceph/inode.c | 2 +- fs/ceph/super.h | 2 +- fs/ext2/acl.c | 3 ++- fs/ext2/acl.h | 2 +- fs/ext2/inode.c | 2 +- fs/ext4/acl.c | 3 ++- fs/ext4/acl.h | 2 +- fs/ext4/inode.c | 2 +- fs/f2fs/acl.c | 4 +++- fs/f2fs/acl.h | 2 +- fs/f2fs/file.c | 2 +- fs/fuse/acl.c | 3 ++- fs/fuse/fuse_i.h | 2 +- fs/gfs2/acl.c | 3 ++- fs/gfs2/acl.h | 2 +- fs/gfs2/inode.c | 2 +- fs/jffs2/acl.c | 3 ++- fs/jffs2/acl.h | 2 +- fs/jffs2/fs.c | 2 +- fs/jfs/acl.c | 3 ++- fs/jfs/file.c | 2 +- fs/jfs/jfs_acl.h | 2 +- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smbacl.c | 4 ++-- fs/ksmbd/vfs.c | 15 ++++++++------- fs/ksmbd/vfs.h | 4 ++-- fs/nfs/nfs3_fs.h | 2 +- fs/nfs/nfs3acl.c | 3 ++- fs/nfsd/nfs2acl.c | 4 ++-- fs/nfsd/nfs3acl.c | 4 ++-- fs/nfsd/vfs.c | 4 ++-- fs/ntfs3/file.c | 2 +- fs/ntfs3/ntfs_fs.h | 4 ++-- fs/ntfs3/xattr.c | 9 +++++---- fs/ocfs2/acl.c | 3 ++- fs/ocfs2/acl.h | 2 +- fs/orangefs/acl.c | 5 +++-- fs/orangefs/inode.c | 7 ++++--- fs/orangefs/orangefs-kernel.h | 4 ++-- fs/posix_acl.c | 22 +++++++++++++--------- fs/reiserfs/acl.h | 6 +++--- fs/reiserfs/inode.c | 2 +- fs/reiserfs/xattr_acl.c | 9 ++++++--- fs/xfs/xfs_acl.c | 3 ++- fs/xfs/xfs_acl.h | 2 +- fs/xfs/xfs_iops.c | 10 ++++++---- 51 files changed, 112 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9d1cde8066cf..bc9917d372ed 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -154,7 +154,7 @@ static int bad_inode_tmpfile(struct user_namespace *mnt_userns, } static int bad_inode_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, + struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 548d6a5477b4..1e47b3ec3989 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -110,10 +110,11 @@ out: return ret; } -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret; + struct inode *inode = d_inode(dentry); umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 727595eee973..d93a4d027706 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3987,7 +3987,7 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b0807c59e321..312ba03c56ae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5256,7 +5256,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } return err; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 
f4fc8e0b847c..c7e8dd5b58d4 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -85,13 +85,14 @@ retry: return acl; } -int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; + struct inode *inode = d_inode(dentry); struct timespec64 old_ctime = inode->i_ctime; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4af5e55abc15..ca8aef906dc1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); + err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode); return err; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 40630e6f691c..50e57a1fa32f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1117,7 +1117,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, int type); + struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index bf298967c5b8..440d5f1e9d47 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -219,11 +219,12 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 925ab6287d35..3841becb94ff 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); -extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 918ab2f9e4c0..e97e77be64f3 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1652,7 +1652,7 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } setattr_copy(&init_user_ns, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 57e82e25f8e2..a9f89539aeee 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -225,12 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int 
error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 3219669732bf..09c4a8a3b716 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); -int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b5ef1b64249..a8e12ce6673d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5550,7 +5550,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); err_out: if (error) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 5bbc44a5216e..c1c74aa658ae 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -276,9 +276,11 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, return error; } -int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index a26e33cab4ff..ea2bbb3f264b 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct user_namespace *, struct inode *, +extern int f2fs_set_acl(struct user_namespace *, struct dentry *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82cda1258227..122339482bdc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1025,7 +1025,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, __setattr_copy(mnt_userns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..8edd0f313515 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -53,9 +53,10 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) return acl; } -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); const char *name; int ret; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 98a9cf531873..26a7c524eb70 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1269,7 +1269,7 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace 
*mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 734d1f05d823..3dcde4912413 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -109,9 +109,10 @@ out: return error; } -int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; bool need_unlock = false; diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index cd180ca7c959..b8de8c148f5c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,7 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 04a201584fa7..314b9ce70682 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1997,7 +1997,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index e945e3484788..8bb58ce5c06c 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -229,10 +229,11 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc, xprefix; + struct inode *inode = d_inode(dentry); switch (type) { case ACL_TYPE_ACCESS: diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9d9fb7cf093e..ca36a6eca594 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 39cec28096a7..66af51c41619 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -202,7 +202,7 @@ int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a653f34c6e26..3b667eccc73b 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -94,12 +94,13 @@ out: return rc; } -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc; tid_t tid; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; tid = txBegin(inode->i_sb, 0); diff --git 
a/fs/jfs/file.c b/fs/jfs/file.c index 332dc9ac47a9..e3eb9c36751f 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -123,7 +123,7 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 3de40286d31f..f0704a25835f 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,7 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d0..2466edc57424 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2956,7 +2956,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, - inode, + path.dentry, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ -2972,7 +2972,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(user_ns, - inode); + path.dentry); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index b05ff9b146b5..a1e05fe997fe 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1386,14 +1386,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_ACCESS, fattr.cf_acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_DEFAULT, fattr.cf_dacls); if (rc) ksmbd_debug(SMB, diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 8de970d6146f..7dee8b78762d 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1824,10 +1824,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode) + struct dentry *dentry) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct inode *inode = d_inode(dentry); int rc; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) @@ -1856,14 +1857,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); else if (S_ISDIR(inode->i_mode)) { posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, - acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); @@ -1874,10 +1874,11 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, } 
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, struct inode *parent_inode) + struct dentry *dentry, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct inode *inode = d_inode(dentry); int rc, i; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) @@ -1895,12 +1896,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, } } - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode)) { - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 593059ca8511..0d73d735cc39 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -160,8 +160,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode); + struct dentry *dentry); int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, + struct dentry *dentry, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 03a4e679fd99..df9ca56db347 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_NFS_V3_ACL extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); -extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 93de0b58647a..22890d97a9e4 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -255,10 +255,11 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; + struct inode *inode = d_inode(dentry); int status; if (S_ISDIR(inode->i_mode)) { diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 13e6e6897f6c..b1839638500c 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); if (error) goto out_drop_lock; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2fb9ee356455..da4a0d09bd84 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = 
set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); out_drop_lock: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f650afedd67f..4eded0729108 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -480,12 +480,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_seclabel->data, attr->na_seclabel->len); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_ACCESS, + dentry, ACL_TYPE_ACCESS, attr->na_pacl); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_DEFAULT, + dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); inode_unlock(inode); if (size_change) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4f2ffc7ef296..ee5101e6bd68 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -802,7 +802,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(mnt_userns, inode, attr); if (mode != inode->i_mode) { - err = ntfs_acl_chmod(mnt_userns, inode); + err = ntfs_acl_chmod(mnt_userns, dentry); if (err) goto out; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 2c791222c4e2..a4d292809a33 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -843,7 +843,7 @@ int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct inode *dir); @@ -852,7 +852,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, #define ntfs_set_acl NULL #endif -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode); +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry); int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 7de8718c68a9..aafe98ee0b21 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -619,10 +619,10 @@ out: /* * ntfs_set_acl - inode_operations::set_acl */ -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); + return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false); } /* @@ -664,8 +664,9 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * ntfs_acl_chmod - Helper for ntfs3_setattr(). 
*/ -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; if (!(sb->s_flags & SB_POSIXACL)) @@ -674,7 +675,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - return posix_acl_chmod(mnt_userns, inode, inode->i_mode); + return posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } /* diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 23a72a423955..9f19cf9a5a9f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -260,12 +260,13 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; int status, had_lock; struct ocfs2_lock_holder oh; + struct inode *inode = d_inode(dentry); had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); if (had_lock < 0) diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 95a57c888ab6..a897c4e41b26 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 0e2db840c217..c5da2091cefb 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -118,12 +118,13 @@ out: return error; } -int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct iattr iattr; int rc; + struct inode *inode = d_inode(dentry); memset(&iattr, 0, sizeof iattr); @@ -152,7 +153,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, rc = __orangefs_set_acl(inode, acl, type); if (!rc && (iattr.ia_valid == ATTR_MODE)) - rc = __orangefs_setattr_mode(inode, &iattr); + rc = __orangefs_setattr_mode(dentry, &iattr); return rc; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 35788cde6d24..825872d8d377 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -833,14 +833,15 @@ out: return ret; } -int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr) +int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr) { int ret; + struct inode *inode = d_inode(dentry); ret = __orangefs_setattr(inode, iattr); /* change mode on a file that has ACLs */ if (!ret && (iattr->ia_valid & ATTR_MODE)) - ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return ret; } @@ -856,7 +857,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) goto out; - ret = __orangefs_setattr_mode(d_inode(dentry), iattr); + ret = __orangefs_setattr_mode(dentry, iattr); sync_inode_metadata(d_inode(dentry), 1); out: gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", diff --git 
a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3298b15684b7..55cd6d50eea1 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -107,7 +107,7 @@ extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, + struct dentry *dentry, struct posix_acl *acl, int type); int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); @@ -361,7 +361,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct orangefs_object_kref *ref); int __orangefs_setattr(struct inode *, struct iattr *); -int __orangefs_setattr_mode(struct inode *inode, struct iattr *iattr); +int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr); int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 74dc0f571dc9..c4bc58a1160e 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -578,19 +578,20 @@ EXPORT_SYMBOL(__posix_acl_chmod); * posix_acl_chmod - chmod a posix acl * * @mnt_userns: user namespace of the mount @inode was found from - * @inode: inode to check permissions on + * @dentry: dentry to check permissions on * @mode: the new mode of @inode * - * If the inode has been found through an idmapped mount the user namespace of + * If the dentry has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. 
*/ int - posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode, + posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode) { + struct inode *inode = d_inode(dentry); struct posix_acl *acl; int ret = 0; @@ -609,7 +610,7 @@ int ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -1139,9 +1140,11 @@ posix_acl_xattr_get(const struct xattr_handler *handler, } int -set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, +set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type, struct posix_acl *acl) { + struct inode *inode = d_inode(dentry); + if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) @@ -1157,14 +1160,14 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, if (ret) return ret; } - return inode->i_op->set_acl(mnt_userns, inode, acl, type); + return inode->i_op->set_acl(mnt_userns, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); static int posix_acl_xattr_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, - struct dentry *unused, struct inode *inode, + struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { @@ -1186,7 +1189,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); } - ret = set_posix_acl(mnt_userns, inode, handler->flags, acl); + ret = set_posix_acl(mnt_userns, dentry, handler->flags, acl); posix_acl_release(acl); return ret; } @@ -1215,10 +1218,11 @@ const struct xattr_handler posix_acl_default_xattr_handler = { }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; + struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { error = posix_acl_update_mode(mnt_userns, inode, diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index d9052b8ce6dd..29c503a06db4 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,9 +49,9 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); -int reiserfs_acl_chmod(struct inode *inode); +int reiserfs_acl_chmod(struct dentry *dentry); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); @@ -63,7 +63,7 @@ int reiserfs_cache_default_acl(struct inode *dir); #define reiserfs_get_acl NULL #define reiserfs_set_acl NULL -static inline int reiserfs_acl_chmod(struct inode *inode) +static inline int reiserfs_acl_chmod(struct dentry *dentry) { return 0; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index b9580a6515ee..c7d1fa526dea 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3404,7 +3404,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!error && reiserfs_posixacl(inode->i_sb)) { if (attr->ia_valid & ATTR_MODE) - error = reiserfs_acl_chmod(inode); + 
error = reiserfs_acl_chmod(dentry); } out: diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d6fcddc46f5b..966ba48e33ec 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; @@ -26,6 +26,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, size_t jcreate_blocks; int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; /* @@ -396,13 +397,15 @@ int reiserfs_cache_default_acl(struct inode *inode) /* * Called under i_mutex */ -int reiserfs_acl_chmod(struct inode *inode) +int reiserfs_acl_chmod(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); + if (IS_PRIVATE(inode)) return 0; if (get_inode_sd_version(inode) == STAT_DATA_V1 || !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b744c62052b6..a05f44eb8178 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -242,12 +242,13 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; bool set_mode = false; int error = 0; + struct inode *inode = d_inode(dentry); if (!acl) goto set_acl; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 263404d0bfda..dcd176149c7a 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); -extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..ab266ba65a84 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -651,6 +651,7 @@ xfs_vn_change_ok( static int xfs_setattr_nonsize( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -757,7 +758,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); if (error) return error; } @@ -779,6 +780,7 @@ out_dqrele: STATIC int xfs_setattr_size( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -810,7 +812,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. 
*/ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(mnt_userns, ip, iattr); + return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } /* @@ -987,7 +989,7 @@ xfs_vn_setattr_size( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (error) return error; - return xfs_setattr_size(mnt_userns, ip, iattr); + return xfs_setattr_size(mnt_userns, dentry, ip, iattr); } STATIC int @@ -1019,7 +1021,7 @@ xfs_vn_setattr( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(mnt_userns, ip, iattr); + error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } return error; -- cgit From 06b4e09aab6c1ae6b3991a729d2fcb31e358e7f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Oct 2022 13:01:11 -0700 Subject: pstore/ram: Set freed addresses to NULL For good measure, set all the freed addresses to NULL when managing przs. Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Kees Cook Reviewed-and-tested-by: Guilherme G. Piccoli Link: https://lore.kernel.org/r/20221011200112.731334-5-keescook@chromium.org --- fs/pstore/ram.c | 13 ++++++++----- fs/pstore/ram_core.c | 11 +++++++++-- fs/pstore/ram_internal.h | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f5bf360cf905..5a39fa8522b0 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -453,25 +453,27 @@ static void ramoops_free_przs(struct ramoops_context *cxt) int i; /* Free pmsg PRZ */ - persistent_ram_free(cxt->mprz); + persistent_ram_free(&cxt->mprz); /* Free console PRZ */ - persistent_ram_free(cxt->cprz); + persistent_ram_free(&cxt->cprz); /* Free dump PRZs */ if (cxt->dprzs) { for (i = 0; i < cxt->max_dump_cnt; i++) - persistent_ram_free(cxt->dprzs[i]); + persistent_ram_free(&cxt->dprzs[i]); kfree(cxt->dprzs); + cxt->dprzs = NULL; cxt->max_dump_cnt = 0; } /* Free ftrace PRZs */ if (cxt->fprzs) { for (i = 0; i < cxt->max_ftrace_cnt; i++) - persistent_ram_free(cxt->fprzs[i]); + persistent_ram_free(&cxt->fprzs[i]); kfree(cxt->fprzs); + cxt->fprzs = NULL; cxt->max_ftrace_cnt = 0; } } @@ -555,9 +557,10 @@ static int ramoops_init_przs(const char *name, while (i > 0) { i--; - persistent_ram_free(prz_ar[i]); + persistent_ram_free(&prz_ar[i]); } kfree(prz_ar); + prz_ar = NULL; goto fail; } *paddr += zone_sz; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 9e1047f4316d..97dde525150a 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -544,8 +544,14 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, return 0; } -void persistent_ram_free(struct persistent_ram_zone *prz) +void persistent_ram_free(struct persistent_ram_zone **_prz) { + struct persistent_ram_zone *prz; + + if (!_prz) + return; + + prz = *_prz; if (!prz) return; @@ -569,6 +575,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) persistent_ram_free_old(prz); kfree(prz->label); kfree(prz); + *_prz = NULL; } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, @@ -605,6 +612,6 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, return prz; err: - persistent_ram_free(prz); + persistent_ram_free(&prz); return ERR_PTR(ret); } diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h index 440ee7a35e10..5f694698351f 100644 --- a/fs/pstore/ram_internal.h +++ b/fs/pstore/ram_internal.h @@ -82,7 +82,7 @@ struct persistent_ram_zone { struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct 
persistent_ram_ecc_info *ecc_info, unsigned int memtype, u32 flags, char *label); -void persistent_ram_free(struct persistent_ram_zone *prz); +void persistent_ram_free(struct persistent_ram_zone **_prz); void persistent_ram_zap(struct persistent_ram_zone *prz); int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, -- cgit From 38b91847c314f49c80e30062549d4709a3754ea6 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 13 Oct 2022 18:06:46 -0300 Subject: pstore: Alert on backend write error The pstore dump function doesn't alert at all on errors. Although pstore is usually a last resort and, if it fails, users won't be able to read the kernel log, this is not the case for users with serial access to the server, for example. So, let's at least attempt to inform such advanced users of the first backend write error detected during the kmsg dump - this is also very useful for pstore debugging purposes. Signed-off-by: Guilherme G. Piccoli Acked-by: Ard Biesheuvel Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221013210648.137452-2-gpiccoli@igalia.com --- fs/pstore/platform.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 06c2c66af332..cbc0b468c1ab 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -393,6 +393,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, const char *why; unsigned int part = 1; unsigned long flags = 0; + int saved_ret = 0; int ret; why = kmsg_dump_reason_str(reason); @@ -463,12 +464,21 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; pstore_timer_kick(); + } else { + /* Preserve only the first non-zero returned value. */ + if (!saved_ret) + saved_ret = ret; } total += record.size; part++; } spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + if (saved_ret) { + pr_err_once("backend (%s) writing error (%d)\n", psinfo->name, + saved_ret); + } } static struct kmsg_dumper pstore_dumper = { -- cgit From cac2f8b8d8b50ef32b3e34f6dcbbf08937e4f616 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:00 +0200 Subject: fs: rename current get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fix up posix acls before storing or reporting them to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers, we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. The current inode operation for getting posix acls takes an inode argument but various filesystems (e.g., 9p, cifs, overlayfs) need access to the dentry. In contrast to the ->set_acl() inode operation, we cannot simply extend ->get_acl() to take a dentry argument. The ->get_acl() inode operation is called from: acl_permission_check() -> check_acl() -> get_acl() which is part of generic_permission() which in turn is part of inode_permission(). Both generic_permission() and inode_permission() are called in the ->permission() handler of various filesystems (e.g., overlayfs). So simply passing a dentry argument to ->get_acl() would amount to also having to pass a dentry argument to ->permission().
We should avoid this unnecessary change. So instead of extending the existing inode operation rename it from ->get_acl() to ->get_inode_acl() and add a ->get_acl() method later that passes a dentry argument and which filesystems that need access to the dentry can implement instead of ->get_inode_acl(). Filesystems like cifs which allow setting and getting posix acls but not using them for permission checking during lookup can simply not implement ->get_inode_acl(). This is intended to be a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Suggested-by/Inspired-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/vfs_inode_dotl.c | 4 ++-- fs/bad_inode.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/ceph/dir.c | 2 +- fs/ceph/inode.c | 2 +- fs/erofs/inode.c | 6 +++--- fs/erofs/namei.c | 2 +- fs/ext2/file.c | 2 +- fs/ext2/namei.c | 4 ++-- fs/ext4/file.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/f2fs/file.c | 2 +- fs/f2fs/namei.c | 4 ++-- fs/fuse/dir.c | 4 ++-- fs/gfs2/inode.c | 4 ++-- fs/jffs2/dir.c | 2 +- fs/jffs2/file.c | 2 +- fs/jfs/file.c | 2 +- fs/jfs/namei.c | 2 +- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smbacl.c | 2 +- fs/ksmbd/vfs.c | 4 ++-- fs/namei.c | 4 ++-- fs/nfs/nfs3acl.c | 6 +++--- fs/nfs/nfs3proc.c | 4 ++-- fs/nfsd/nfs2acl.c | 4 ++-- fs/nfsd/nfs3acl.c | 4 ++-- fs/nfsd/nfs4acl.c | 4 ++-- fs/ntfs3/file.c | 2 +- fs/ntfs3/namei.c | 4 ++-- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/namei.c | 2 +- fs/orangefs/inode.c | 2 +- fs/orangefs/namei.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/overlayfs/inode.c | 6 +++--- fs/posix_acl.c | 43 ++++++++++++++++++++++--------------------- fs/reiserfs/file.c | 2 +- fs/reiserfs/namei.c | 4 ++-- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/xfs_iops.c | 6 +++--- 42 files changed, 89 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5cfa4b4f070f..0d1a7f2c579d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -983,14 +983,14 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { diff --git a/fs/bad_inode.c b/fs/bad_inode.c index bc9917d372ed..92737166203f 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -177,7 +177,7 @@ static const struct inode_operations bad_inode_ops = .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, - .get_acl = bad_inode_get_acl, + .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 312ba03c56ae..7a3076ccbab4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -11288,7 +11288,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .mknod = btrfs_mknod, .listxattr = btrfs_listxattr, .permission = btrfs_permission, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, @@ -11341,7 
+11341,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .fileattr_get = btrfs_fileattr_get, @@ -11352,7 +11352,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e7e2ebac330d..6c7026cc8988 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2033,7 +2033,7 @@ const struct inode_operations ceph_dir_iops = { .getattr = ceph_getattr, .setattr = ceph_setattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ca8aef906dc1..31cd27843eca 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -126,7 +126,7 @@ const struct inode_operations ceph_file_iops = { .setattr = ceph_setattr, .getattr = ceph_getattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, }; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index ad2a82f2eb4c..2d571343deec 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -371,7 +371,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; @@ -379,12 +379,12 @@ const struct inode_operations erofs_symlink_iops = { .get_link = page_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; const struct inode_operations erofs_fast_symlink_iops = { .get_link = simple_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 0dc34721080c..b64a108fac92 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -228,6 +228,6 @@ const struct inode_operations erofs_dir_iops = { .lookup = erofs_lookup, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index eb97aa3d700e..6b4bebe982ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -200,7 +200,7 @@ const struct inode_operations ext2_file_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, .fileattr_get = ext2_fileattr_get, diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 9125eab85146..c056957221a2 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -427,7 +427,7 @@ const struct inode_operations ext2_dir_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, .fileattr_get = ext2_fileattr_get, @@ -438,6 +438,6 @@ const struct inode_operations 
ext2_special_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a7a597c727e6..7ac0a81bd371 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -955,7 +955,7 @@ const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e9bc46684106..67a257a69758 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -870,7 +870,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d5daaf41e1fc..b8a91d74fdd1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -4186,7 +4186,7 @@ const struct inode_operations ext4_dir_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, @@ -4197,6 +4197,6 @@ const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 122339482bdc..83df6f6173d3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1046,7 +1046,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a389772fd212..c227113b0f26 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1379,7 +1379,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, @@ -1397,7 +1397,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, }; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb97a384dc5d..25e6b0f7e73d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1935,7 +1935,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, @@ -1957,7 +1957,7 @@ static const struct inode_operations fuse_common_inode_operations = { 
.permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 314b9ce70682..1371e067d2a7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2149,7 +2149,7 @@ static const struct inode_operations gfs2_file_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .fileattr_get = gfs2_fileattr_get, @@ -2171,7 +2171,7 @@ static const struct inode_operations gfs2_dir_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .atomic_open = gfs2_atomic_open, diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c0aabbcbfd58..f399b390b5f6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -62,7 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index ba86acbe12d3..3cf71befa475 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -64,7 +64,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jfs/file.c b/fs/jfs/file.c index e3eb9c36751f..88663465aecd 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -133,7 +133,7 @@ const struct inode_operations jfs_file_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9db4f5789c0e..b50afaf7966f 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1525,7 +1525,7 @@ const struct inode_operations jfs_dir_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 2466edc57424..9306e10753f9 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2487,9 +2487,9 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr, fattr->cf_dacls = NULL; if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS); + fattr->cf_acls = get_inode_acl(inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT); + fattr->cf_dacls = get_inode_acl(inode, ACL_TYPE_DEFAULT); } } diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index a1e05fe997fe..ab5c68cc0e13 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1289,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, } if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); + posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); if 
(posix_acls && !found) { unsigned int id = -1; diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 7dee8b78762d..93f65f01a4a6 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1375,7 +1375,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return NULL; - posix_acls = get_acl(inode, acl_type); + posix_acls = get_inode_acl(inode, acl_type); if (!posix_acls) return NULL; @@ -1884,7 +1884,7 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return -EOPNOTSUPP; - acls = get_acl(parent_inode, ACL_TYPE_DEFAULT); + acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT); if (!acls) return -ENOENT; pace = acls->a_entries; diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..1c80fd23cda2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -297,13 +297,13 @@ static int check_acl(struct user_namespace *mnt_userns, acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; - /* no ->get_acl() calls in RCU mode... */ + /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(mnt_userns, inode, acl, mask); } - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 22890d97a9e4..74d11e3c4205 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -265,14 +265,14 @@ int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (S_ISDIR(inode->i_mode)) { switch(type) { case ACL_TYPE_ACCESS: - alloc = get_acl(inode, ACL_TYPE_DEFAULT); + alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(alloc)) goto fail; dfacl = alloc; break; case ACL_TYPE_DEFAULT: - alloc = get_acl(inode, ACL_TYPE_ACCESS); + alloc = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(alloc)) goto fail; dfacl = acl; @@ -313,7 +313,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, struct posix_acl *acl; char *p = data + *result; - acl = get_acl(inode, type); + acl = get_inode_acl(inode, type); if (IS_ERR_OR_NULL(acl)) return 0; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2e7579626cf0..4bf208a0a8e9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -998,7 +998,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; @@ -1009,7 +1009,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index b1839638500c..c43c25a8da2e 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -55,7 +55,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) goto out; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -69,7 +69,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! 
*/ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index da4a0d09bd84..9daa621817d8 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -47,7 +47,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -61,7 +61,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index bb8e2f6d7d03..518203821790 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, unsigned int flags = 0; int size = 0; - pacl = get_acl(inode, ACL_TYPE_ACCESS); + pacl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (!pacl) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; - dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(dpacl)) { error = PTR_ERR(dpacl); goto rel_pacl; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index ee5101e6bd68..c5e4a886593d 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -1255,7 +1255,7 @@ const struct inode_operations ntfs_file_inode_operations = { .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, }; diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index bc22cc321a74..053cc0e0f8b5 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -367,7 +367,7 @@ const struct inode_operations ntfs_dir_inode_operations = { .mknod = ntfs_mknod, .rename = ntfs_rename, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .setattr = ntfs3_setattr, .getattr = ntfs_getattr, @@ -379,7 +379,7 @@ const struct inode_operations ntfs_special_inode_operations = { .setattr = ntfs3_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, }; // clang-format on diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9c67edd215d5..af900aaa9275 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2712,7 +2712,7 @@ const struct inode_operations ocfs2_file_iops = { .permission = ocfs2_permission, .listxattr = ocfs2_listxattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, .fileattr_get = ocfs2_fileattr_get, .fileattr_set = ocfs2_fileattr_set, @@ -2722,7 +2722,7 @@ const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, }; 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388..c5ffded7ac92 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2916,7 +2916,7 @@ const struct inode_operations ocfs2_dir_iops = { .permission = ocfs2_permission, .listxattr = ocfs2_listxattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, .fileattr_get = ocfs2_fileattr_get, .fileattr_set = ocfs2_fileattr_set, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 825872d8d377..8974b0fbf00d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -975,7 +975,7 @@ static int orangefs_fileattr_set(struct user_namespace *mnt_userns, /* ORANGEFS2 implementation of VFS inode operations for files */ static const struct inode_operations orangefs_file_inode_operations = { - .get_acl = orangefs_get_acl, + .get_inode_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .setattr = orangefs_setattr, .getattr = orangefs_getattr, diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 600e8eee541f..75c1a3dcf68c 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -430,7 +430,7 @@ static int orangefs_rename(struct user_namespace *mnt_userns, /* ORANGEFS implementation of VFS inode operations for directories */ const struct inode_operations orangefs_dir_inode_operations = { .lookup = orangefs_lookup, - .get_acl = orangefs_get_acl, + .get_inode_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .create = orangefs_create, .unlink = orangefs_unlink, diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6b03457f72bb..7bece7010c00 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1311,7 +1311,7 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_acl = ovl_get_acl, + .get_inode_acl = ovl_get_acl, .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 9e61511de7a7..6eefd8b7868e 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -517,7 +517,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) const struct cred *old_cred; old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); + acl = get_inode_acl(realinode, type); revert_creds(old_cred); } /* @@ -721,7 +721,7 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_acl = ovl_get_acl, + .get_inode_acl = ovl_get_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, @@ -741,7 +741,7 @@ static const struct inode_operations ovl_special_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_acl = ovl_get_acl, + .get_inode_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index c4bc58a1160e..8bbabe477cb2 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -64,7 +64,7 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) if (acl == ACL_DONT_CACHE) { struct posix_acl *ret; - ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU); + ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU); if (!IS_ERR(ret)) acl = ret; } @@ -106,7 +106,7 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -struct posix_acl *get_acl(struct 
inode *inode, int type) +struct posix_acl *get_inode_acl(struct inode *inode, int type) { void *sentinel; struct posix_acl **p; @@ -114,7 +114,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) /* * The sentinel is used to detect when another operation like - * set_cached_acl() or forget_cached_acl() races with get_acl(). + * set_cached_acl() or forget_cached_acl() races with get_inode_acl(). * It is guaranteed that is_uncached_acl(sentinel) is true. */ @@ -133,24 +133,24 @@ struct posix_acl *get_acl(struct inode *inode, int type) * current value of the ACL will not be ACL_NOT_CACHED and so our own * sentinel will not be set; another task will update the cache. We * could wait for that other task to complete its job, but it's easier - * to just call ->get_acl to fetch the ACL ourself. (This is going to - * be an unlikely race.) + * to just call ->get_inode_acl to fetch the ACL ourself. (This is + * going to be an unlikely race.) */ cmpxchg(p, ACL_NOT_CACHED, sentinel); /* - * Normally, the ACL returned by ->get_acl will be cached. + * Normally, the ACL returned by ->get_inode_acl will be cached. * A filesystem can prevent that by calling - * forget_cached_acl(inode, type) in ->get_acl. + * forget_cached_acl(inode, type) in ->get_inode_acl. * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. + * If the filesystem doesn't have a get_inode_acl() function at all, + * we'll just create the negative cache entry. */ - if (!inode->i_op->get_acl) { + if (!inode->i_op->get_inode_acl) { set_cached_acl(inode, type, NULL); return NULL; } - acl = inode->i_op->get_acl(inode, type, false); + acl = inode->i_op->get_inode_acl(inode, type, false); if (IS_ERR(acl)) { /* @@ -169,7 +169,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) posix_acl_release(acl); return acl; } -EXPORT_SYMBOL(get_acl); +EXPORT_SYMBOL(get_inode_acl); /* * Init a fresh posix_acl @@ -600,7 +600,7 @@ int if (!inode->i_op->set_acl) return -EOPNOTSUPP; - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR_OR_NULL(acl)) { if (acl == ERR_PTR(-EOPNOTSUPP)) return 0; @@ -630,7 +630,7 @@ posix_acl_create(struct inode *dir, umode_t *mode, if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; - p = get_acl(dir, ACL_TYPE_DEFAULT); + p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; @@ -1045,7 +1045,8 @@ posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, * Filesystems that store POSIX ACLs in the unaltered uapi format should use * posix_acl_from_xattr() when reading them from the backing store and * converting them into the struct posix_acl VFS format. The helper is - * specifically intended to be called from the ->get_acl() inode operation. + * specifically intended to be called from the ->get_inode_acl() inode + * operation. * * The posix_acl_from_xattr() function will map the raw {g,u}id values stored * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The @@ -1053,11 +1054,11 @@ posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, * correct k{g,u}id_t. The returned struct posix_acl can be cached. * * Note that posix_acl_from_xattr() does not take idmapped mounts into account. - * If it did it calling is from the ->get_acl() inode operation would return - * POSIX ACLs mapped according to an idmapped mount which would mean that the - * value couldn't be cached for the filesystem. 
Idmapped mounts are taken into - * account on the fly during permission checking or right at the VFS - - * userspace boundary before reporting them to the user. + * If it did it calling is from the ->get_inode_acl() inode operation would + * return POSIX ACLs mapped according to an idmapped mount which would mean + * that the value couldn't be cached for the filesystem. Idmapped mounts are + * taken into account on the fly during permission checking or right at the VFS + * - userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. @@ -1127,7 +1128,7 @@ posix_acl_xattr_get(const struct xattr_handler *handler, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - acl = get_acl(inode, handler->flags); + acl = get_inode_acl(inode, handler->flags); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6e228bfbe7ef..467d13da198f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -256,7 +256,7 @@ const struct inode_operations reiserfs_file_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3d7a35d6a18b..4d428e8704bc 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1659,7 +1659,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, @@ -1683,6 +1683,6 @@ const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 966ba48e33ec..93fe414fed18 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -372,7 +372,7 @@ int reiserfs_cache_default_acl(struct inode *inode) if (IS_PRIVATE(inode)) return 0; - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (acl && !IS_ERR(acl)) { int size = reiserfs_acl_size(acl->a_count); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ab266ba65a84..712238305bc3 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1103,7 +1103,7 @@ xfs_vn_tmpfile( } static const struct inode_operations xfs_inode_operations = { - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1130,7 +1130,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1157,7 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = 
xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, -- cgit From bd9684b042dcbb400ecb4d169b74c9adc84aa088 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:02 +0200 Subject: cifs: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsacl.c | 67 ++++++++++++++++++ fs/cifs/cifsfs.c | 2 + fs/cifs/cifsproto.h | 6 ++ fs/cifs/cifssmb.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 271 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fa480d62f313..a6730e0eb57b 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -20,6 +21,8 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "fs_context.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { @@ -1668,3 +1671,67 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, kfree(pntsd); return rc; } + +struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + struct posix_acl *acl = NULL; + ssize_t rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + acl = ERR_CAST(full_path); + goto out; + } + + /* return alt name if available as pseudo attr */ + switch (type) { + case ACL_TYPE_ACCESS: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, 
pTcon, full_path, &acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + + if (rc < 0) { + if (rc == -EINVAL) + acl = ERR_PTR(-EOPNOTSUPP); + else + acl = ERR_PTR(rc); + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return acl; +#else + return ERR_PTR(-EOPNOTSUPP); +#endif +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c6ac19223ddc..05d0f84ede95 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1133,6 +1133,7 @@ const struct inode_operations cifs_dir_inode_ops = { .symlink = cifs_symlink, .mknod = cifs_mknod, .listxattr = cifs_listxattr, + .get_acl = cifs_get_acl, }; const struct inode_operations cifs_file_inode_ops = { @@ -1141,6 +1142,7 @@ const struct inode_operations cifs_file_inode_ops = { .permission = cifs_permission, .listxattr = cifs_listxattr, .fiemap = cifs_fiemap, + .get_acl = cifs_get_acl, }; const struct inode_operations cifs_symlink_inode_ops = { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 83e83d8beabb..28e848495594 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -224,6 +224,8 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *, u32); extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); +extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); extern unsigned int setup_authusers_ACE(struct cifs_ace *pace); @@ -541,6 +543,10 @@ extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char *acl_inf, const int buflen, const int acl_type, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + struct posix_acl **acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *fileName, const char *local_acl, const int buflen, const int acl_type, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1724066c1536..d30535406885 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3212,6 +3212,202 @@ setACLerrorExit: return rc; } +#ifdef CONFIG_FS_POSIX_ACL +/** + * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format + * @ace: POSIX ACL entry to store converted ACL into + * @cifs_ace: ACL in cifs format + * + * Convert an Access Control Entry from wire format to local POSIX xattr + * format. + * + * Note that the @cifs_uid member is used to store both {g,u}id_t. 
+ */ +static void cifs_init_posix_acl(struct posix_acl_entry *ace, + struct cifs_posix_ace *cifs_ace) +{ + /* u8 cifs fields do not need le conversion */ + ace->e_perm = cifs_ace->cifs_e_perm; + ace->e_tag = cifs_ace->cifs_e_tag; + + switch (ace->e_tag) { + case ACL_USER: + ace->e_uid = make_kuid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + case ACL_GROUP: + ace->e_gid = make_kgid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + } + return; +} + +/** + * cifs_to_posix_acl - copy cifs ACL format to POSIX ACL format + * @acl: ACLs returned in POSIX ACL format + * @src: ACLs in cifs format + * @acl_type: type of POSIX ACL requested + * @size_of_data_area: size of SMB we got + * + * This function converts ACLs from cifs format to POSIX ACL format. + * If @acl is NULL then the size of the buffer required to store POSIX ACLs in + * their uapi format is returned. + */ +static int cifs_to_posix_acl(struct posix_acl **acl, char *src, + const int acl_type, const int size_of_data_area) +{ + int size = 0; + __u16 count; + struct cifs_posix_ace *pACE; + struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; + struct posix_acl *kacl = NULL; + struct posix_acl_entry *pa, *pe; + + if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) + return -EOPNOTSUPP; + + if (acl_type == ACL_TYPE_ACCESS) { + count = le16_to_cpu(cifs_acl->access_entry_count); + pACE = &cifs_acl->ace_array[0]; + size = sizeof(struct cifs_posix_acl); + size += sizeof(struct cifs_posix_ace) * count; + /* check if we would go beyond end of SMB */ + if (size_of_data_area < size) { + cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n", + size_of_data_area, size); + return -EINVAL; + } + } else if (acl_type == ACL_TYPE_DEFAULT) { + count = le16_to_cpu(cifs_acl->access_entry_count); + size = sizeof(struct cifs_posix_acl); + size += sizeof(struct cifs_posix_ace) * count; + /* skip past access ACEs to get to default ACEs */ + pACE = &cifs_acl->ace_array[count]; + count = le16_to_cpu(cifs_acl->default_entry_count); + size += sizeof(struct cifs_posix_ace) * count; + /* check if we would go beyond end of SMB */ + if (size_of_data_area < size) + return -EINVAL; + } else { + /* illegal type */ + return -EINVAL; + } + + /* Allocate number of POSIX ACLs to store in VFS format. 
*/ + kacl = posix_acl_alloc(count, GFP_NOFS); + if (!kacl) + return -ENOMEM; + + FOREACH_ACL_ENTRY(pa, kacl, pe) { + cifs_init_posix_acl(pa, pACE); + pACE++; + } + + *acl = kacl; + return 0; +} + +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ +/* SMB_QUERY_POSIX_ACL */ + TRANSACTION2_QPI_REQ *pSMB = NULL; + TRANSACTION2_QPI_RSP *pSMBr = NULL; + int rc = 0; + int bytes_returned; + int name_len; + __u16 params, byte_count; + + cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName); + +queryAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); + name_len++; /* trailing null */ + name_len *= 2; + pSMB->FileName[name_len] = 0; + pSMB->FileName[name_len+1] = 0; + } else { + name_len = copy_path_name(pSMB->FileName, searchName); + } + + params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find exact max data count below from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = cpu_to_le16( + offsetof(struct smb_com_transaction2_qpi_req, + InformationLevel) - 4); + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); + byte_count = params + 1 /* pad */ ; + pSMB->TotalParameterCount = cpu_to_le16(params); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); + if (rc) { + cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc); + } else { + /* decode response */ + + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 count = le16_to_cpu(pSMBr->t2.DataCount); + rc = cifs_to_posix_acl(acl, + (char *)&pSMBr->hdr.Protocol+data_offset, + acl_type, count); + } + } + cifs_buf_release(pSMB); + /* + * The else branch after SendReceive() doesn't return EAGAIN so if we + * allocated @acl in cifs_to_posix_acl() we are guaranteed to return + * here and don't leak POSIX ACLs. 
+ */ + if (rc == -EAGAIN) + goto queryAclRetry; + return rc; +} +#else +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_FS_POSIX_ACL */ + int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask) -- cgit From dc1af4c4b4721abfa07b351063825acc7e69cc66 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:03 +0200 Subject: cifs: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fix up posix acls before storing or reporting them to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far cifs wasn't able to implement get and set acl inode operations because it needs access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let cifs implement get and set acl inode operations. This is mostly a copy and paste of the codepaths currently used in cifs' posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the cifs specific posix acl xattr handler and associated code will be removed and the code duplication will go away. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
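For illustration, the dentry-based set acl inode operation implemented here has the following general shape. This is only a hedged sketch; the myfs_* names and store helpers are hypothetical and not code from this series:

	/* Sketch of a dentry-based ->set_acl() hook; "myfs" is made up. */
	static int myfs_set_acl(struct user_namespace *mnt_userns,
				struct dentry *dentry, struct posix_acl *acl,
				int type)
	{
		struct inode *inode = d_inode(dentry);

		/* Symlinks never carry POSIX ACLs. */
		if (S_ISLNK(inode->i_mode))
			return -EOPNOTSUPP;

		/* A NULL @acl conventionally means "remove the ACL of this @type". */
		if (!acl)
			return myfs_store_remove_acl(dentry, type);

		/* Convert @acl and write it to the backing store. */
		return myfs_store_write_acl(dentry, type, acl);
	}

	static const struct inode_operations myfs_file_inode_ops = {
		.set_acl = myfs_set_acl,
	};
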
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsacl.c | 72 +++++++++++++++++++++++ fs/cifs/cifsfs.c | 2 + fs/cifs/cifsproto.h | 6 ++ fs/cifs/cifssmb.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 240 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index a6730e0eb57b..6a9f03c882dc 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -1735,3 +1737,73 @@ out: return ERR_PTR(-EOPNOTSUPP); #endif } + +int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + int rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto out; + } + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) { + cifs_dbg(FYI, "size of EA value too large\n"); + rc = -EOPNOTSUPP; + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (!acl) + goto out; + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (!acl) + goto out; + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +#else + return -EOPNOTSUPP; +#endif +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 05d0f84ede95..9d8f077a23c4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1134,6 +1134,7 @@ const struct inode_operations cifs_dir_inode_ops = { .mknod = cifs_mknod, .listxattr = cifs_listxattr, .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; const struct inode_operations cifs_file_inode_ops = { @@ -1143,6 +1144,7 @@ const struct inode_operations cifs_file_inode_ops = { .listxattr = cifs_listxattr, .fiemap = cifs_fiemap, .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; const struct inode_operations cifs_symlink_inode_ops = { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 28e848495594..ce7f08a387b5 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -226,6 +226,8 @@ extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type); +extern int cifs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); extern unsigned int setup_authusers_ACE(struct 
cifs_ace *pace); @@ -551,6 +553,10 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *fileName, const char *local_acl, const int buflen, const int acl_type, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const struct posix_acl *acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); #endif /* CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d30535406885..f119ca917947 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3308,6 +3308,83 @@ static int cifs_to_posix_acl(struct posix_acl **acl, char *src, return 0; } +/** + * cifs_init_ace - convert ACL entry from POSIX ACL to cifs format + * @cifs_ace: the cifs ACL entry to store into + * @local_ace: the POSIX ACL entry to convert + */ +static void cifs_init_ace(struct cifs_posix_ace *cifs_ace, + const struct posix_acl_entry *local_ace) +{ + cifs_ace->cifs_e_perm = local_ace->e_perm; + cifs_ace->cifs_e_tag = local_ace->e_tag; + + switch (local_ace->e_tag) { + case ACL_USER: + cifs_ace->cifs_uid = + cpu_to_le64(from_kuid(&init_user_ns, local_ace->e_uid)); + break; + case ACL_GROUP: + cifs_ace->cifs_uid = + cpu_to_le64(from_kgid(&init_user_ns, local_ace->e_gid)); + break; + default: + cifs_ace->cifs_uid = cpu_to_le64(-1); + } +} + +/** + * posix_acl_to_cifs - convert ACLs from POSIX ACL to cifs format + * @parm_data: ACLs in cifs format to conver to + * @acl: ACLs in POSIX ACL format to convert from + * @acl_type: the type of POSIX ACLs stored in @acl + * + * Return: the number cifs ACL entries after conversion + */ +static __u16 posix_acl_to_cifs(char *parm_data, const struct posix_acl *acl, + const int acl_type) +{ + __u16 rc = 0; + struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; + const struct posix_acl_entry *pa, *pe; + int count; + int i = 0; + + if ((acl == NULL) || (cifs_acl == NULL)) + return 0; + + count = acl->a_count; + cifs_dbg(FYI, "setting acl with %d entries\n", count); + + /* + * Note that the uapi POSIX ACL version is verified by the VFS and is + * independent of the cifs ACL version. Changing the POSIX ACL version + * is a uapi change and if it's changed we will pass down the POSIX ACL + * version in struct posix_acl from the VFS. For now there's really + * only one that all filesystems know how to deal with. 
+ */ + cifs_acl->version = cpu_to_le16(1); + if (acl_type == ACL_TYPE_ACCESS) { + cifs_acl->access_entry_count = cpu_to_le16(count); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); + } else if (acl_type == ACL_TYPE_DEFAULT) { + cifs_acl->default_entry_count = cpu_to_le16(count); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); + } else { + cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); + return 0; + } + FOREACH_ACL_ENTRY(pa, acl, pe) { + cifs_init_ace(&cifs_acl->ace_array[i++], pa); + } + if (rc == 0) { + rc = (__u16)(count * sizeof(struct cifs_posix_ace)); + rc += sizeof(struct cifs_posix_acl); + /* BB add check to make sure ACL does not overflow SMB */ + } + return rc; +} + int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, struct posix_acl **acl, const int acl_type, const struct nls_table *nls_codepage, @@ -3398,6 +3475,81 @@ queryAclRetry: goto queryAclRetry; return rc; } + +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + struct smb_com_transaction2_spi_req *pSMB = NULL; + struct smb_com_transaction2_spi_rsp *pSMBr = NULL; + char *parm_data; + int name_len; + int rc = 0; + int bytes_returned = 0; + __u16 params, byte_count, data_count, param_offset, offset; + + cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName); +setAclRetry: + rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = copy_path_name(pSMB->FileName, fileName); + } + params = 6 + name_len; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB size from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_spi_req, + InformationLevel) - 4; + offset = param_offset + params; + parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + + /* convert to on the wire format for POSIX ACL */ + data_count = posix_acl_to_cifs(parm_data, acl, acl_type); + + if (data_count == 0) { + rc = -EOPNOTSUPP; + goto setACLerrorExit; + } + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL); + byte_count = 3 /* pad */ + params + data_count; + pSMB->DataCount = cpu_to_le16(data_count); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->Reserved4 = 0; + inc_rfc1001_len(pSMB, byte_count); + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc); + +setACLerrorExit: + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setAclRetry; + return rc; +} #else int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, struct posix_acl **acl, @@ -3406,6 +3558,14 @@ int cifs_do_get_acl(const 
unsigned int xid, struct cifs_tcon *tcon, { return -EOPNOTSUPP; } + +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FS_POSIX_ACL */ int -- cgit From 6cd4d4e8b6e1495eb0cafa3a59d1fde137a98d22 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:04 +0200 Subject: 9p: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fix up posix acls before storing or reporting them to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far 9p implemented a ->get_inode_acl() operation that didn't require access to the dentry in order to allow (limited) permission checking via posix acls in the vfs. Now that we have get and set acl inode operations that take a dentry argument we can give 9p get and set acl inode operations. This is mostly a refactoring of the codepaths currently used in 9p's posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the 9p specific posix acl xattr handler and associated code will be removed. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
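To make the split concrete: after this patch a filesystem wires up both read hooks side by side, the inode-based one for in-kernel permission checking and the dentry-based one for the new posix acl api. A minimal sketch with hypothetical myfs_* names; the signatures match the ones used in the diff below:

	/* Inode-based hook; may be called in RCU walk mode (@rcu). */
	struct posix_acl *myfs_iop_get_inode_acl(struct inode *inode,
						 int type, bool rcu);

	/* Dentry-based hook for the dedicated posix acl api. */
	struct posix_acl *myfs_iop_get_acl(struct user_namespace *mnt_userns,
					   struct dentry *dentry, int type);

	static const struct inode_operations myfs_file_inode_ops = {
		.get_inode_acl = myfs_iop_get_inode_acl,
		.get_acl       = myfs_iop_get_acl,
	};
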
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/acl.c | 80 ++++++++++++++++++++++++++++++++++++++------------ fs/9p/acl.h | 5 +++- fs/9p/vfs_inode_dotl.c | 6 ++-- 3 files changed, 69 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 4dac4a0dc5f4..67f8b57c67e0 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -17,34 +17,64 @@ #include "v9fs_vfs.h" #include "fid.h" -static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) +static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name) { ssize_t size; void *value = NULL; struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); - if (size > 0) { - value = kzalloc(size, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - size = v9fs_fid_xattr_get(fid, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - goto err_out; - } - } else if (size == -ENODATA || size == 0 || - size == -ENOSYS || size == -EOPNOTSUPP) { - acl = NULL; - } else - acl = ERR_PTR(-EIO); - -err_out: + if (size < 0) + return ERR_PTR(size); + if (size == 0) + return ERR_PTR(-ENODATA); + + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + size = v9fs_fid_xattr_get(fid, name, value, size); + if (size < 0) + acl = ERR_PTR(size); + else if (size == 0) + acl = ERR_PTR(-ENODATA); + else + acl = posix_acl_from_xattr(&init_user_ns, value, size); kfree(value); return acl; } +static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name) +{ + struct p9_fid *fid; + struct posix_acl *acl = NULL; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return ERR_CAST(fid); + + acl = v9fs_fid_get_acl(fid, name); + p9_fid_put(fid); + return acl; +} + +static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name) +{ + int retval; + struct posix_acl *acl = NULL; + + acl = v9fs_fid_get_acl(fid, name); + if (!IS_ERR(acl)) + return acl; + + retval = PTR_ERR(acl); + if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP) + return NULL; + + /* map everything else to -EIO */ + return ERR_PTR(-EIO); +} + int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { int retval = 0; @@ -89,7 +119,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu) { struct v9fs_session_info *v9ses; @@ -109,6 +139,18 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) } +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_dentry2v9ses(dentry); + /* We allow set/get/list of acl when access=client is not specified. 
*/ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_acl_get(dentry, posix_acl_xattr_name(type)); + return v9fs_get_cached_acl(d_inode(dentry), type); +} + static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index ce5175d463dd..359dab4da900 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -8,8 +8,10 @@ #ifdef CONFIG_9P_FS_POSIX_ACL int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, struct posix_acl *dacl, struct posix_acl *acl); @@ -17,6 +19,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else +#define v9fs_iop_get_inode_acl NULL #define v9fs_iop_get_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0d1a7f2c579d..a4211fcb9168 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -983,14 +983,16 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_inode_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_inode_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, - .get_inode_acl = v9fs_iop_get_acl, + .get_inode_acl = v9fs_iop_get_inode_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { -- cgit From 079da629383ea960dfa0615e18d51d8bd121bd0c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:05 +0200 Subject: 9p: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in it's current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystem to implement get and set acl. So far 9p implemented a ->get_inode_acl() operation that didn't require access to the dentry in order to allow (limited) permission checking via posix acls in the vfs. Now that we have get and set acl inode operations that take a dentry argument we can give 9p get and set acl inode operations. This is mostly a light refactoring of the codepaths currently used in 9p posix acl xattr handler. After we have fully implemented the posix acl api and switched the vfs over to it, the 9p specific posix acl xattr handler and associated code will be removed. 
Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/acl.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/9p/acl.h | 3 ++ fs/9p/vfs_inode_dotl.c | 2 ++ 3 files changed, 99 insertions(+) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 67f8b57c67e0..135b26cee63a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -151,6 +151,100 @@ struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, return v9fs_get_cached_acl(d_inode(dentry), type); } +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int retval; + size_t size = 0; + void *value = NULL; + const char *acl_name; + struct v9fs_session_info *v9ses; + struct inode *inode = d_inode(dentry); + + if (acl) { + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); + if (retval) + goto err_out; + + size = posix_acl_xattr_size(acl->a_count); + + value = kzalloc(size, GFP_NOFS); + if (!value) { + retval = -ENOMEM; + goto err_out; + } + + retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (retval < 0) + goto err_out; + } + + /* + * set the attribute on the remote. Without even looking at the + * xattr value. We leave it to the server to validate + */ + acl_name = posix_acl_xattr_name(type); + v9ses = v9fs_dentry2v9ses(dentry); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + goto err_out; + } + + if (S_ISLNK(inode->i_mode)) { + retval = -EOPNOTSUPP; + goto err_out; + } + + if (!inode_owner_or_capable(&init_user_ns, inode)) { + retval = -EPERM; + goto err_out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (acl) { + struct iattr iattr = {}; + struct posix_acl *acl_mode = acl; + + retval = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, + &acl_mode); + if (retval) + goto err_out; + if (!acl_mode) { + /* + * ACL can be represented by the mode bits. + * So don't update ACL below. + */ + kfree(value); + value = NULL; + size = 0; + } + iattr.ia_valid = ATTR_MODE; + /* + * FIXME should we update ctime ? + * What is the following setxattr update the mode ? + */ + v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + retval = acl ? 
-EINVAL : 0; + goto err_out; + } + break; + } + + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + if (!retval) + set_cached_acl(inode, type, acl); + +err_out: + kfree(value); + return retval; +} + static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 359dab4da900..4c60a2bce5de 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -12,6 +12,8 @@ struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type); +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, struct posix_acl *dacl, struct posix_acl *acl); @@ -21,6 +23,7 @@ void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else #define v9fs_iop_get_inode_acl NULL #define v9fs_iop_get_acl NULL +#define v9fs_iop_set_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index a4211fcb9168..03c1743c4aff 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -985,6 +985,7 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { @@ -993,6 +994,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { -- cgit From 56851bc9b9f072dd738f25ed29c0d5abe9f2908b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 29 Sep 2022 10:47:36 +0200 Subject: internal: add may_write_xattr() Split out the generic checks whether an inode allows writing xattrs. Since security.* and system.* xattrs don't have any restrictions and we're going to split out posix acls into a dedicated api we will use this helper to check whether we can write posix acls. Signed-off-by: Christian Brauner (Microsoft) --- fs/internal.h | 1 + fs/xattr.c | 43 ++++++++++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 6f0386b34fae..aa5f240496d9 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -232,5 +232,6 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..31b5ac65ca34 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -80,6 +80,31 @@ xattr_resolve_name(struct inode *inode, const char **name) return ERR_PTR(-EOPNOTSUPP); } +/** + * may_write_xattr - check whether inode allows writing xattr + * @mnt_userns: User namespace of the mount the inode was found from + * @inode: the inode on which to set an xattr + * + * Check whether the inode allows writing xattrs. 
Specifically, we can never + * set or remove an extended attribute on a read-only filesystem or on an + * immutable / append-only inode. + * + * We also need to ensure that the inode has a mapping in the mount to + * not risk writing back invalid i_{g,u}id values. + * + * Return: On success zero is returned. On error a negative errno is returned. + */ +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) +{ + if (IS_IMMUTABLE(inode)) + return -EPERM; + if (IS_APPEND(inode)) + return -EPERM; + if (HAS_UNMAPPED_ID(mnt_userns, inode)) + return -EPERM; + return 0; +} + /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. @@ -88,20 +113,12 @@ static int xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, const char *name, int mask) { - /* - * We can never set or remove an extended attribute on a read-only - * filesystem or on an immutable / append-only inode. - */ if (mask & MAY_WRITE) { - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - /* - * Updating an xattr will likely cause i_uid and i_gid - * to be writen back improperly if their true value is - * unknown to the vfs. - */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) - return -EPERM; + int ret; + + ret = may_write_xattr(mnt_userns, inode); + if (ret) + return ret; } /* -- cgit From e4cc9163032fed6ff27dd03325ddc54f88863a24 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:06 +0200 Subject: acl: add vfs_set_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_set_acl() api that can be used to set posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. 
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bbabe477cb2..970250506f07 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -25,6 +25,11 @@ #include #include #include +#include +#include +#include + +#include "internal.h" static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -1257,3 +1262,105 @@ int simple_acl_create(struct inode *dir, struct inode *inode) posix_acl_release(acl); return 0; } + +static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + struct posix_acl *acl) +{ + for (int n = 0; n < acl->a_count; n++) { + struct posix_acl_entry *acl_e = &acl->a_entries[n]; + + switch (acl_e->e_tag) { + case ACL_USER: + acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns, + VFSUIDT_INIT(acl_e->e_uid)); + break; + case ACL_GROUP: + acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns, + VFSGIDT_INIT(acl_e->e_gid)); + break; + } + } + + return 0; +} + +/** + * vfs_set_acl - set posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to set the posix acls + * @acl_name: the name of the posix acl + * @kacl: the posix acls in the appropriate VFS format + * + * This function sets @kacl. The caller must all posix_acl_release() on @kacl + * afterwards. + * + * Return: On success 0, on error negative errno. + */ +int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, struct posix_acl *kacl) +{ + int acl_type; + int error; + struct inode *inode = d_inode(dentry); + struct inode *delegated_inode = NULL; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return -EINVAL; + + if (kacl) { + /* + * If we're on an idmapped mount translate from mount specific + * vfs{g,u}id_t into global filesystem k{g,u}id_t. + * Afterwards we can cache the POSIX ACLs filesystem wide and - + * if this is a filesystem with a backing store - ultimately + * translate them to backing store values. + */ + error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl); + if (error) + return error; + } + +retry_deleg: + inode_lock(inode); + + /* + * We only care about restrictions the inode struct itself places upon + * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
+ */ + error = may_write_xattr(mnt_userns, inode); + if (error) + goto out_inode_unlock; + + error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + if (error) + goto out_inode_unlock; + + error = try_break_deleg(inode, &delegated_inode); + if (error) + goto out_inode_unlock; + + if (inode->i_opflags & IOP_XATTR) + error = set_posix_acl(mnt_userns, dentry, acl_type, kacl); + else if (unlikely(is_bad_inode(inode))) + error = -EIO; + else + error = -EOPNOTSUPP; + if (!error) { + fsnotify_xattr(dentry); + evm_inode_post_set_acl(dentry, acl_name, kacl); + } + +out_inode_unlock: + inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_set_acl); -- cgit From 4f353ba4a9f42ad283dc6afdd84dae0b1d294842 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:13 +0200 Subject: acl: add vfs_get_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_get_acl() api that can be used to get posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 122 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 970250506f07..af04e177f019 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -111,7 +111,9 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -struct posix_acl *get_inode_acl(struct inode *inode, int type) +static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + int type) { void *sentinel; struct posix_acl **p; @@ -144,19 +146,21 @@ struct posix_acl *get_inode_acl(struct inode *inode, int type) cmpxchg(p, ACL_NOT_CACHED, sentinel); /* - * Normally, the ACL returned by ->get_inode_acl will be cached. + * Normally, the ACL returned by ->get{_inode}_acl will be cached. * A filesystem can prevent that by calling - * forget_cached_acl(inode, type) in ->get_inode_acl. + * forget_cached_acl(inode, type) in ->get{_inode}_acl. * - * If the filesystem doesn't have a get_inode_acl() function at all, + * If the filesystem doesn't have a get{_inode}_ acl() function at all, * we'll just create the negative cache entry. 
*/ - if (!inode->i_op->get_inode_acl) { + if (dentry && inode->i_op->get_acl) { + acl = inode->i_op->get_acl(mnt_userns, dentry, type); + } else if (inode->i_op->get_inode_acl) { + acl = inode->i_op->get_inode_acl(inode, type, false); + } else { set_cached_acl(inode, type, NULL); return NULL; } - acl = inode->i_op->get_inode_acl(inode, type, false); - if (IS_ERR(acl)) { /* * Remove our sentinel so that we don't block future attempts @@ -174,6 +178,11 @@ struct posix_acl *get_inode_acl(struct inode *inode, int type) posix_acl_release(acl); return acl; } + +struct posix_acl *get_inode_acl(struct inode *inode, int type) +{ + return __get_acl(&init_user_ns, NULL, inode, type); +} EXPORT_SYMBOL(get_inode_acl); /* @@ -1120,6 +1129,67 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, } EXPORT_SYMBOL (posix_acl_to_xattr); +/** + * vfs_posix_acl_to_xattr - convert from kernel to userspace representation + * @mnt_userns: user namespace of the mount + * @inode: inode the posix acls are set on + * @acl: the posix acls as represented by the vfs + * @buffer: the buffer into which to convert @acl + * @size: size of @buffer + * + * This converts @acl from the VFS representation in the filesystem idmapping + * to the uapi form reportable to userspace. Mount and caller idmappings + * are handled appropriately. + * + * Return: On success, the size of the stored uapi posix acls, on error a + * negative errno. + */ +ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, + struct inode *inode, const struct posix_acl *acl, + void *buffer, size_t size) + +{ + struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_entry *ext_entry; + struct user_namespace *fs_userns, *caller_userns; + ssize_t real_size, n; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_entry = (void *)(ext_acl + 1); + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + fs_userns = i_user_ns(inode); + caller_userns = current_user_ns(); + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid); + ext_entry->e_id = cpu_to_le32(from_kuid( + caller_userns, vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid); + ext_entry->e_id = cpu_to_le32(from_kgid( + caller_userns, vfsgid_into_kgid(vfsgid))); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} + static int posix_acl_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, @@ -1364,3 +1434,48 @@ out_inode_unlock: return error; } EXPORT_SYMBOL_GPL(vfs_set_acl); + +/** + * vfs_get_acl - get posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to retrieve the posix acls + * @acl_name: the name of the posix acl + * + * This function retrieves @kacl from the filesystem. The caller must call + * posix_acl_release() on @kacl. + * + * Return: On success POSIX ACLs in VFS format, on error negative errno.
+ */ +struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name) +{ + struct inode *inode = d_inode(dentry); + struct posix_acl *acl; + int acl_type, error; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return ERR_PTR(-EINVAL); + + /* + * The VFS has no restrictions on reading POSIX ACLs so calling + * something like xattr_permission() isn't needed. Only LSMs get a say. + */ + error = security_inode_get_acl(mnt_userns, dentry, acl_name); + if (error) + return ERR_PTR(error); + + if (!IS_POSIXACL(inode)) + return ERR_PTR(-EOPNOTSUPP); + if (S_ISLNK(inode->i_mode)) + return ERR_PTR(-EOPNOTSUPP); + + acl = __get_acl(mnt_userns, dentry, inode, acl_type); + if (IS_ERR(acl)) + return acl; + if (!acl) + return ERR_PTR(-ENODATA); + + return acl; +} +EXPORT_SYMBOL_GPL(vfs_get_acl); -- cgit From aeb7f00542af48ac63e448de46d672cfd79a7069 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:14 +0200 Subject: acl: add vfs_remove_acl() In previous patches we implemented get and set inode operations for all non-stacking filesystems that support posix acls but didn't yet implement get and/or set acl inode operations. This specifically affected cifs and 9p. Now we can build a posix acl api based solely on get and set inode operations. We add a new vfs_remove_acl() api that can be used to remove posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. After we finished building the vfs api we can switch stacking filesystems to rely on the new posix api and then finally switch the xattr system calls themselves to rely on the posix acl api. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index af04e177f019..3a141082b24c 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1479,3 +1479,68 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, return acl; } EXPORT_SYMBOL_GPL(vfs_get_acl); + +/** + * vfs_remove_acl - remove posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to remove the posix acls + * @acl_name: the name of the posix acl + * + * This function removes posix acls. + * + * Return: On success 0, on error negative errno. + */ +int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name) +{ + int acl_type; + int error; + struct inode *inode = d_inode(dentry); + struct inode *delegated_inode = NULL; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return -EINVAL; + +retry_deleg: + inode_lock(inode); + + /* + * We only care about restrictions the inode struct itself places upon + * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */ + error = may_write_xattr(mnt_userns, inode); + if (error) + goto out_inode_unlock; + + error = security_inode_remove_acl(mnt_userns, dentry, acl_name); + if (error) + goto out_inode_unlock; + + error = try_break_deleg(inode, &delegated_inode); + if (error) + goto out_inode_unlock; + + if (inode->i_opflags & IOP_XATTR) + error = set_posix_acl(mnt_userns, dentry, acl_type, NULL); + else if (unlikely(is_bad_inode(inode))) + error = -EIO; + else + error = -EOPNOTSUPP; + if (!error) { + fsnotify_xattr(dentry); + evm_inode_post_remove_acl(mnt_userns, dentry, acl_name); + } + +out_inode_unlock: + inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_remove_acl); -- cgit From b82784a2f52a7a2a5491d36f0c257cf64d87abb5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:16 +0200 Subject: ksmbd: use vfs_remove_acl() The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. Now that we've switched all filesystems that can serve as the lower filesystem for ksmbd we can switch ksmbd over to rely on the posix acl api. Note that this is orthogonal to switching the vfs itself over. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/ksmbd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 93f65f01a4a6..7ccda91b0dc4 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1321,7 +1321,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + err = vfs_remove_acl(user_ns, dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); -- cgit From af84016f1cfe78b23c8efbf6ba5c3c660d52a9ee Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:17 +0200 Subject: ecryptfs: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far ecryptfs didn't implement get and set acl inode operations because it wanted easy access to the dentry.
Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let ecryptfs implement get and set acl inode operations. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change. Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/ecryptfs/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c214fe0981bd..b4be9f6bafd2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include "ecryptfs_kernel.h" @@ -1120,6 +1122,13 @@ static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, return rc; } +static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry), + posix_acl_xattr_name(type)); +} + const struct inode_operations ecryptfs_symlink_iops = { .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, @@ -1143,6 +1152,7 @@ const struct inode_operations ecryptfs_dir_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, }; const struct inode_operations ecryptfs_main_iops = { @@ -1152,6 +1162,7 @@ const struct inode_operations ecryptfs_main_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, }; static int ecryptfs_xattr_get(const struct xattr_handler *handler, -- cgit From 86c261b9eb4ce0a4ae189c0317cd8eddb8302d5a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:18 +0200 Subject: ecryptfs: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. So far ecryptfs didn't implement get and set acl inode operations because it wanted easy access to the dentry. Now that we extended the set acl inode operation to take a dentry argument and added a new get acl inode operation that takes a dentry argument we can let ecryptfs implement get and set acl inode operations. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
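The shape of the operation this series keeps wiring into stacking filesystems is small and regular. The following is an illustrative sketch only, not ecryptfs code: the "stackfs_" names and the trivial lower-dentry lookup are hypothetical stand-ins, and the real ecryptfs diff follows after the tags below.

	#include <linux/fs.h>
	#include <linux/fs_stack.h>
	#include <linux/posix_acl.h>
	#include <linux/posix_acl_xattr.h>

	/* Hypothetical: a stacking fs keeping its lower dentry in d_fsdata. */
	static struct dentry *stackfs_lower_dentry(struct dentry *dentry)
	{
		return dentry->d_fsdata;
	}

	static int stackfs_set_acl(struct user_namespace *mnt_userns,
				   struct dentry *dentry, struct posix_acl *acl,
				   int type)
	{
		struct dentry *lower = stackfs_lower_dentry(dentry);
		int rc;

		/* posix_acl_xattr_name() maps ACL_TYPE_{ACCESS,DEFAULT} back to a name. */
		rc = vfs_set_acl(&init_user_ns, lower,
				 posix_acl_xattr_name(type), acl);
		if (!rc)
			/* Keep the stacked inode's attributes in sync with the lower one. */
			fsstack_copy_attr_all(d_inode(dentry), d_inode(lower));
		return rc;
	}

Note how the parsed struct posix_acl is handed straight through; no uapi buffer is ever materialized in the stacking layer.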
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/ecryptfs/inode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b4be9f6bafd2..5802b93b2cda 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1129,6 +1129,21 @@ static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, posix_acl_xattr_name(type)); } +static int ecryptfs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, + int type) +{ + int rc; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_inode = d_inode(lower_dentry); + + rc = vfs_set_acl(&init_user_ns, lower_dentry, + posix_acl_xattr_name(type), acl); + if (!rc) + fsstack_copy_attr_all(d_inode(dentry), lower_inode); + return rc; +} + const struct inode_operations ecryptfs_symlink_iops = { .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, @@ -1153,6 +1168,7 @@ const struct inode_operations ecryptfs_dir_iops = { .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; const struct inode_operations ecryptfs_main_iops = { @@ -1163,6 +1179,7 @@ const struct inode_operations ecryptfs_main_iops = { .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; static int ecryptfs_xattr_get(const struct xattr_handler *handler, -- cgit From 6c0a8bfb84af8f3d1960e6905eace59d6488d270 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:19 +0200 Subject: ovl: implement get acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. Now that we have added get and set acl inode operations that allow easy access to the dentry we give overlayfs its own get and set acl inode operations. Since overlayfs is a stacking filesystem it will use the newly added posix acl api when retrieving posix acls from the relevant layer. Overlayfs can also be mounted on top of idmapped layers. If idmapped layers are used, overlayfs must take the layer's idmapping into account after retrieving the posix acls from the relevant layer. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
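The idmapped-layer translation can be summarized in one helper. The sketch below is illustrative only (clone_and_idmap_acl() is a hypothetical name): it condenses what ovl_idmap_posix_acl() and ovl_get_acl_path() in the diff that follows implement, namely clone first, then shift only the id-bearing entries through the layer's mount idmapping.

	static struct posix_acl *clone_and_idmap_acl(struct user_namespace *mnt_userns,
						     struct user_namespace *fs_userns,
						     struct posix_acl *acl)
	{
		/* Never modify cached ACLs in place; they are shared filesystem wide. */
		struct posix_acl *clone = posix_acl_clone(acl, GFP_KERNEL);

		if (!clone)
			return ERR_PTR(-ENOMEM);

		for (int n = 0; n < clone->a_count; n++) {
			struct posix_acl_entry *e = &clone->a_entries[n];

			/* Only ACL_USER/ACL_GROUP entries carry ids that need mapping. */
			switch (e->e_tag) {
			case ACL_USER:
				e->e_uid = vfsuid_into_kuid(
					make_vfsuid(mnt_userns, fs_userns, e->e_uid));
				break;
			case ACL_GROUP:
				e->e_gid = vfsgid_into_kgid(
					make_vfsgid(mnt_userns, fs_userns, e->e_gid));
				break;
			}
		}
		return clone;
	}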
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/dir.c | 3 +- fs/overlayfs/inode.c | 109 +++++++++++++++++++++++++++++++++-------------- fs/overlayfs/overlayfs.h | 17 +++++++- 3 files changed, 95 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 7bece7010c00..eb49d5d7b56f 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1311,7 +1311,8 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_inode_acl = ovl_get_acl, + .get_inode_acl = ovl_get_inode_acl, + .get_acl = ovl_get_acl, .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 6eefd8b7868e..74edbd1c1159 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "overlayfs.h" @@ -460,7 +462,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * of the POSIX ACLs retrieved from the lower layer to this function to not * alter the POSIX ACLs for the underlying filesystem. */ -static void ovl_idmap_posix_acl(struct inode *realinode, +static void ovl_idmap_posix_acl(const struct inode *realinode, struct user_namespace *mnt_userns, struct posix_acl *acl) { @@ -484,6 +486,64 @@ static void ovl_idmap_posix_acl(struct inode *realinode, } } +/* + * The @noperm argument is used to skip permission checking and is a temporary + * measure. Quoting Miklos from an earlier discussion: + * + * > So there are two paths to getting an acl: + * > 1) permission checking and 2) retrieving the value via getxattr(2). + * > This is a similar situation as reading a symlink vs. following it. + * > When following a symlink overlayfs always reads the link on the + * > underlying fs just as if it was a readlink(2) call, calling + * > security_inode_readlink() instead of security_inode_follow_link(). + * > This is logical: we are reading the link from the underlying storage, + * > and following it on overlayfs. + * > + * > Applying the same logic to acl: we do need to call the + * > security_inode_getxattr() on the underlying fs, even if just want to + * > check permissions on overlay. This is currently not done, which is an + * > inconsistency. + * > + * > Maybe adding the check to ovl_get_acl() is the right way to go, but + * > I'm a little afraid of a performance regression. Will look into that. + * + * Until we have made a decision allow this helper to take the @noperm + * argument. We should hopefully be able to remove it soon. + */ +static struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm) +{ + struct posix_acl *real_acl, *clone; + struct user_namespace *mnt_userns; + struct inode *realinode = d_inode(path->dentry); + + mnt_userns = mnt_user_ns(path->mnt); + + if (noperm) + real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); + else + real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name); + if (IS_ERR_OR_NULL(real_acl)) + return real_acl; + + if (!is_idmapped_mnt(path->mnt)) + return real_acl; + + /* + * We cannot alter the ACLs returned from the relevant layer as that + * would alter the cached values filesystem wide for the lower + * filesystem. Instead we can clone the ACLs and then apply the + * relevant idmapping of the layer. 
+ */ + clone = posix_acl_clone(real_acl, GFP_KERNEL); + posix_acl_release(real_acl); /* release original acl */ + if (!clone) + return ERR_PTR(-ENOMEM); + + ovl_idmap_posix_acl(realinode, mnt_userns, clone); + return clone; +} + /* * When the relevant layer is an idmapped mount we need to take the idmapping * of the layer into account and translate any ACL_{GROUP,USER} values @@ -495,10 +555,12 @@ static void ovl_idmap_posix_acl(struct inode *realinode, * * This is obviously only relevant when idmapped layers are used. */ -struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) +struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + bool rcu, bool noperm) { struct inode *realinode = ovl_inode_real(inode); - struct posix_acl *acl, *clone; + struct posix_acl *acl; struct path realpath; if (!IS_POSIXACL(realinode)) @@ -512,40 +574,23 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) } if (rcu) { + /* + * If the layer is idmapped drop out of RCU path walk + * so we can clone the ACLs. + */ + if (is_idmapped_mnt(realpath.mnt)) + return ERR_PTR(-ECHILD); + acl = get_cached_acl_rcu(realinode, type); } else { const struct cred *old_cred; old_cred = ovl_override_creds(inode->i_sb); - acl = get_inode_acl(realinode, type); + acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); revert_creds(old_cred); } - /* - * If there are no POSIX ACLs, or we encountered an error, - * or the layer isn't idmapped we don't need to do anything. - */ - if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) - return acl; - - /* - * We only get here if the layer is idmapped. So drop out of RCU path - * walk so we can clone the ACLs. There's no need to release the ACLs - * since get_cached_acl_rcu() doesn't take a reference on the ACLs. - */ - if (rcu) - return ERR_PTR(-ECHILD); - clone = posix_acl_clone(acl, GFP_KERNEL); - if (!clone) - clone = ERR_PTR(-ENOMEM); - else - ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone); - /* - * Since we're not in RCU path walk we always need to release the - * original ACLs. 
- */ - posix_acl_release(acl); - return clone; + return acl; } #endif @@ -721,7 +766,8 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_inode_acl = ovl_get_acl, + .get_inode_acl = ovl_get_inode_acl, + .get_acl = ovl_get_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, @@ -741,7 +787,8 @@ static const struct inode_operations ovl_special_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, - .get_inode_acl = ovl_get_acl, + .get_inode_acl = ovl_get_inode_acl, + .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index eee8f08d32b6..d334977030c8 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -594,9 +594,22 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + bool rcu, bool noperm); +static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, + bool rcu) +{ + return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true); +} +static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); +} #else -#define ovl_get_acl NULL +#define ovl_get_inode_acl NULL +#define ovl_get_acl NULL #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); -- cgit From 0e641857322f4d0a99aec3f49b21b732f3bb5ef9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:20 +0200 Subject: ovl: implement set acl method The current way of setting and getting posix acls through the generic xattr interface is error prone and type unsafe. The vfs needs to interpret and fixup posix acls before storing or reporting it to userspace. Various hacks exist to make this work. The code is hard to understand and difficult to maintain in its current form. Instead of making this work by hacking posix acls through xattr handlers we are building a dedicated posix acl api around the get and set inode operations. This removes a lot of hackiness and makes the codepaths easier to maintain. A lot of background can be found in [1]. In order to build a type safe posix api around get and set acl we need all filesystems to implement get and set acl. Now that we have added get and set acl inode operations that allow easy access to the dentry we give overlayfs its own get and set acl inode operations. The set acl inode operation duplicates most of the ovl posix acl xattr handler. The main difference is that the set acl inode operation relies on the new posix acl api. Once the vfs has been switched over the custom posix acl xattr handler will be removed completely. Note, until the vfs has been switched to the new posix acl api this patch is a non-functional change.
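One user-visible rule the new set acl path preserves (see the ACL_TYPE_DEFAULT check in the diff below) is that default ACLs are only meaningful on directories. A rough userspace sketch of that behavior, using libacl (link with -lacl; the path argument is a placeholder, and the EACCES expectation reflects the generic vfs check, not anything overlayfs-specific):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/acl.h>

	int main(int argc, char *argv[])
	{
		/* Pass a regular file; default ACLs are only valid on directories. */
		const char *path = argc > 1 ? argv[1] : "testfile";
		acl_t acl = acl_from_text("u::rwx,g::r-x,o::r-x");

		if (!acl) {
			perror("acl_from_text");
			return 1;
		}
		if (acl_set_file(path, ACL_TYPE_DEFAULT, acl))
			/* Expected to fail with EACCES on non-directories. */
			printf("default acl on %s rejected: %s\n",
			       path, strerror(errno));
		acl_free(acl);
		return 0;
	}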
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/dir.c | 1 + fs/overlayfs/inode.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/overlayfs.h | 17 +++++++++ 3 files changed, 112 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index eb49d5d7b56f..0e817ebce92c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1313,6 +1313,7 @@ const struct inode_operations ovl_dir_inode_operations = { .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 74edbd1c1159..304a6dbb852a 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -592,6 +592,98 @@ struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, return acl; } + +static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, + struct posix_acl *acl, int type) +{ + int err; + struct path realpath; + const char *acl_name; + const struct cred *old_cred; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdentry = ovl_dentry_upper(dentry); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + + err = ovl_want_write(dentry); + if (err) + return err; + + /* + * If ACL is to be removed from a lower file, check if it exists in + * the first place before copying it up. + */ + acl_name = posix_acl_xattr_name(type); + if (!acl && !upperdentry) { + struct posix_acl *real_acl; + + ovl_path_lower(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, + acl_name); + revert_creds(old_cred); + posix_acl_release(real_acl); + if (IS_ERR(real_acl)) { + err = PTR_ERR(real_acl); + goto out_drop_write; + } + } + + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + realdentry = ovl_dentry_upper(dentry); + } + + old_cred = ovl_override_creds(dentry->d_sb); + if (acl) + err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); + else + err = ovl_do_remove_acl(ofs, realdentry, acl_name); + revert_creds(old_cred); + + /* copy c/mtime */ + ovl_copyattr(inode); + +out_drop_write: + ovl_drop_write(dentry); + return err; +} + +int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int err; + struct inode *inode = d_inode(dentry); + struct dentry *workdir = ovl_workdir(dentry); + struct inode *realinode = ovl_inode_real(inode); + + if (!IS_POSIXACL(d_inode(workdir))) + return -EOPNOTSUPP; + if (!realinode->i_op->set_acl) + return -EOPNOTSUPP; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + if (!inode_owner_or_capable(&init_user_ns, inode)) + return -EPERM; + + /* + * Check if sgid bit needs to be cleared (actual setacl operation will + * be done with mounter's capabilities and so that won't do it for us). 
+ */ + if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { + struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; + + err = ovl_setattr(&init_user_ns, dentry, &iattr); + if (err) + return err; + } + + return ovl_set_or_remove_acl(dentry, inode, acl, type); +} #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) @@ -768,6 +860,7 @@ static const struct inode_operations ovl_file_inode_operations = { .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, @@ -789,6 +882,7 @@ static const struct inode_operations ovl_special_inode_operations = { .listxattr = ovl_listxattr, .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d334977030c8..ab5061c9aa2a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include "ovl_entry.h" #undef pr_fmt @@ -278,6 +280,18 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); } +static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, + const char *acl_name, struct posix_acl *acl) +{ + return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl); +} + +static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, + const char *acl_name) +{ + return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name); +} + static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, struct dentry *olddentry, struct inode *newdir, struct dentry *newdentry, unsigned int flags) @@ -607,9 +621,12 @@ static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, { return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); } +int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); #else #define ovl_get_inode_acl NULL #define ovl_get_acl NULL +#define ovl_set_acl NULL #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); -- cgit From 31acceb97500dd6e9105526301d76488cd6ca21c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:21 +0200 Subject: ovl: use posix acl api Now that posix acls have a proper api, use it to copy them. All filesystems that can serve as lower or upper layers for overlayfs have gained support for the new posix acl api in previous patches. So switch all internal overlayfs codepaths for copying posix acls to the new posix acl api.
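In condensed form, the copy-up side of this switch follows a get/clone/set/release pattern. The sketch below uses a hypothetical helper name and trims the ENODATA/EOPNOTSUPP filtering; it mirrors the ovl_copy_acl() added in the copy-up diff that follows:

	static int copy_one_acl(struct ovl_fs *ofs, const struct path *lowerpath,
				struct dentry *upper, const char *acl_name)
	{
		struct posix_acl *acl, *clone;
		int err;

		/* Fetch the lower ACLs through the typed API (may be NULL). */
		acl = ovl_get_acl_path(lowerpath, acl_name, false);
		if (IS_ERR_OR_NULL(acl))
			return PTR_ERR_OR_ZERO(acl);

		/* Clone so the lower filesystem's cached ACLs are never mutated. */
		clone = posix_acl_clone(acl, GFP_KERNEL);
		posix_acl_release(acl);
		if (!clone)
			return -ENOMEM;

		err = ovl_do_set_acl(ofs, upper, acl_name, clone);
		posix_acl_release(clone);
		return err;
	}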
Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/copy_up.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/overlayfs/dir.c | 20 ++------------------ fs/overlayfs/inode.c | 4 ++-- fs/overlayfs/overlayfs.h | 8 ++++++++ fs/overlayfs/super.c | 6 ++---- fs/xattr.c | 6 ------ 6 files changed, 52 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f436d8847f08..6e4e65ee050d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -44,6 +44,35 @@ static bool ovl_must_copy_xattr(const char *name) !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } +static int ovl_copy_acl(struct ovl_fs *ofs, const struct path *path, + struct dentry *dentry, const char *acl_name) +{ + int err; + struct posix_acl *clone, *real_acl = NULL; + + real_acl = ovl_get_acl_path(path, acl_name, false); + if (!real_acl) + return 0; + + if (IS_ERR(real_acl)) { + err = PTR_ERR(real_acl); + if (err == -ENODATA || err == -EOPNOTSUPP) + return 0; + return err; + } + + clone = posix_acl_clone(real_acl, GFP_KERNEL); + posix_acl_release(real_acl); /* release original acl */ + if (!clone) + return -ENOMEM; + + err = ovl_do_set_acl(ofs, dentry, acl_name, clone); + + /* release cloned acl */ + posix_acl_release(clone); + return err; +} + int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new) { struct dentry *old = oldpath->dentry; @@ -93,6 +122,15 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de error = 0; continue; /* Discard */ } + + if (is_posix_acl_xattr(name)) { + error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name); + if (!error) + continue; + /* POSIX ACLs must be copied. */ + break; + } + retry: size = ovl_do_getxattr(oldpath, name, value, value_size); if (size == -ERANGE) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 0e817ebce92c..cbb569d5d234 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -435,28 +435,12 @@ out: } static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry, - const char *name, const struct posix_acl *acl) + const char *acl_name, struct posix_acl *acl) { - void *buffer; - size_t size; - int err; - if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) return 0; - size = posix_acl_xattr_size(acl->a_count); - buffer = kmalloc(size, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - if (err < 0) - goto out_free; - - err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE); -out_free: - kfree(buffer); - return err; + return ovl_do_set_acl(ofs, upperdentry, acl_name, acl); } static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 304a6dbb852a..77a77fd7a77b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -510,8 +510,8 @@ static void ovl_idmap_posix_acl(const struct inode *realinode, * Until we have made a decision allow this helper to take the @noperm * argument. We should hopefully be able to remove it soon. 
*/ -static struct posix_acl *ovl_get_acl_path(const struct path *path, - const char *acl_name, bool noperm) +struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm) { struct posix_acl *real_acl, *clone; struct user_namespace *mnt_userns; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ab5061c9aa2a..480e6aabef27 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -623,10 +623,18 @@ static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, } int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); +struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm); #else #define ovl_get_inode_acl NULL #define ovl_get_acl NULL #define ovl_set_acl NULL +static inline struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, + bool noperm) +{ + return NULL; +} #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a29a8afe9b26..5c1b7971a9b3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -813,13 +813,11 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. */ - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_DEFAULT); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_ACCESS); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; diff --git a/fs/xattr.c b/fs/xattr.c index 31b5ac65ca34..9ed9eea4d1b9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -299,12 +299,6 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); -static inline bool is_posix_acl_xattr(const char *name) -{ - return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); -} - int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) -- cgit From 318e66856ddec05384f32d60b5598128289f4e7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:22 +0200 Subject: xattr: use posix acl api In previous patches we built a new posix api solely around get and set inode operations. Now that we have all the pieces in place we can switch the system calls and the vfs over to only rely on this api when interacting with posix acls. This finally removes all type unsafety and type conversion issues explained in detail in [1] that we aim to get rid of. With the new posix acl api we immediately translate into an appropriate kernel internal struct posix_acl format both when getting and setting posix acls. This is a stark contrast to before, where we hacked unsafe raw values into the uapi struct stored in a void pointer, relying on filesystems and security modules to hack around in the uapi struct as well.
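For userspace nothing changes at the system call boundary: the wire format remains the uapi posix acl xattr layout that the do_get_acl()/do_set_acl() helpers below translate to and from. As a rough, runnable illustration of that format (the struct layout mirrors the uapi posix_acl_xattr header as I understand it; the le*toh() helpers are glibc's, and the path argument is a placeholder):

	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/xattr.h>

	/* Matches the uapi wire layout; all fields are little-endian on the wire. */
	struct pacl_entry {
		uint16_t e_tag;
		uint16_t e_perm;
		uint32_t e_id;
	};

	int main(int argc, char *argv[])
	{
		const char *path = argc > 1 ? argv[1] : ".";
		uint32_t buf[256]; /* keeps the header word naturally aligned */
		ssize_t len = getxattr(path, "system.posix_acl_access",
				       buf, sizeof(buf));

		if (len < (ssize_t)sizeof(uint32_t)) {
			perror("getxattr");
			return 1;
		}

		/* Header is a single version word; POSIX_ACL_XATTR_VERSION == 2. */
		struct pacl_entry *e = (struct pacl_entry *)&buf[1];
		size_t count = (len - sizeof(uint32_t)) / sizeof(*e);

		printf("version %u, %zu entries\n",
		       (unsigned)le32toh(buf[0]), count);
		for (size_t i = 0; i < count; i++)
			printf("  tag %#x perm %#o id %u\n",
			       (unsigned)le16toh(e[i].e_tag),
			       (unsigned)le16toh(e[i].e_perm),
			       (unsigned)le32toh(e[i].e_id));
		return 0;
	}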
Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Signed-off-by: Christian Brauner (Microsoft) --- fs/internal.h | 20 ++++++++++++++++++++ fs/posix_acl.c | 37 +++++++++++++++++++++++++++++++++++++ fs/xattr.c | 31 ++++++++++++++++++++----------- 3 files changed, 77 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index aa5f240496d9..e377eb7bbe7f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -234,4 +234,24 @@ int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); +#ifdef CONFIG_FS_POSIX_ACL +int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, const void *kvalue, size_t size); +ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size); +#else +static inline int do_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + const void *kvalue, size_t size) +{ + return -EOPNOTSUPP; +} +static inline ssize_t do_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name, + void *kvalue, size_t size) +{ + return -EOPNOTSUPP; +} +#endif + ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3a141082b24c..14b87653b797 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1544,3 +1544,40 @@ out_inode_unlock: return error; } EXPORT_SYMBOL_GPL(vfs_remove_acl); + +int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, const void *kvalue, size_t size) +{ + int error; + struct posix_acl *acl = NULL; + + if (size) { + /* + * Note that posix_acl_from_xattr() uses GFP_NOFS when it + * probably doesn't need to here. 
+ */ + acl = posix_acl_from_xattr(current_user_ns(), kvalue, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + error = vfs_set_acl(mnt_userns, dentry, acl_name, acl); + posix_acl_release(acl); + return error; +} + +ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size) +{ + ssize_t error; + struct posix_acl *acl; + + acl = vfs_get_acl(mnt_userns, dentry, acl_name); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + error = vfs_posix_acl_to_xattr(mnt_userns, d_inode(dentry), + acl, kvalue, size); + posix_acl_release(acl); + return error; +} diff --git a/fs/xattr.c b/fs/xattr.c index 9ed9eea4d1b9..77db5aa26d13 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -189,6 +189,9 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -410,6 +413,9 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -482,6 +488,9 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -591,17 +600,13 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -static void setxattr_convert(struct user_namespace *mnt_userns, - struct dentry *d, struct xattr_ctx *ctx) -{ - if (ctx->size && is_posix_acl_xattr(ctx->kname->name)) - posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); -} - int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx) { - setxattr_convert(mnt_userns, dentry, ctx); + if (is_posix_acl_xattr(ctx->kname->name)) + return do_set_acl(mnt_userns, dentry, ctx->kname->name, + ctx->kvalue, ctx->size); + return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } @@ -708,10 +713,11 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); + if (is_posix_acl_xattr(ctx->kname->name)) + error = do_get_acl(mnt_userns, d, kname, ctx->kvalue, ctx->size); + else + error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); if (error > 0) { - if (is_posix_acl_xattr(kname)) - posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { @@ -886,6 +892,9 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; + if (is_posix_acl_xattr(kname)) + return vfs_remove_acl(mnt_userns, d, kname); + return vfs_removexattr(mnt_userns, d, kname); } -- cgit From 04af28faae6760514f0302aa9db561e60ee53a20 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:23 +0200 Subject: ecryptfs: use stub posix acl handlers Now that ecryptfs supports the get and set acl inode operations and the vfs has been switched to the new posix api, ecryptfs can simply rely on the stub posix acl handlers.
Signed-off-by: Christian Brauner (Microsoft) --- fs/ecryptfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5802b93b2cda..f3cd00fac9c3 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1210,6 +1210,10 @@ static const struct xattr_handler ecryptfs_xattr_handler = { }; const struct xattr_handler *ecryptfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif &ecryptfs_xattr_handler, NULL }; -- cgit From 200afb77cde7e51364d4a8b3e176f4797a3b5ec7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:24 +0200 Subject: ovl: use stub posix acl handlers Now that ovl supports the get and set acl inode operations and the vfs has been switched to the new posix api, ovl can simply rely on the stub posix acl handlers. The custom xattr handlers and associated unused helpers can be removed. Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/super.c | 101 ++------------------------------------------------- 1 file changed, 4 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5c1b7971a9b3..2addafe4e14a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -999,83 +999,6 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } -static int __maybe_unused -ovl_posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return ovl_xattr_get(dentry, inode, handler->name, buffer, size); -} - -static int __maybe_unused -ovl_posix_acl_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct dentry *workdir = ovl_workdir(dentry); - struct inode *realinode = ovl_inode_real(inode); - struct posix_acl *acl = NULL; - int err; - - /* Check that everything is OK before copy-up */ - if (value) { - /* The above comment can be understood in two ways: - * - * 1. We just want to check whether the basic POSIX ACL format - * is ok. For example, if the header is correct and the size - * is sane. - * 2. We want to know whether the ACL_{GROUP,USER} entries can - * be mapped according to the underlying filesystem. - * - * Currently, we only check 1. If we wanted to check 2. we - * would need to pass the mnt_userns and the fs_userns of the - * underlying filesystem. But frankly, I think checking 1. is - * enough to start the copy-up. - */ - acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - err = -EOPNOTSUPP; - if (!IS_POSIXACL(d_inode(workdir))) - goto out_acl_release; - if (!realinode->i_op->set_acl) - goto out_acl_release; - if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) { - err = acl ? -EACCES : 0; - goto out_acl_release; - } - err = -EPERM; - if (!inode_owner_or_capable(&init_user_ns, inode)) - goto out_acl_release; - - posix_acl_release(acl); - - /* - * Check if sgid bit needs to be cleared (actual setacl operation will - * be done with mounter's capabilities and so that won't do it for us).
*/ - if (unlikely(inode->i_mode & S_ISGID) && - handler->flags == ACL_TYPE_ACCESS && - !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { - struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - - err = ovl_setattr(&init_user_ns, dentry, &iattr); - if (err) - return err; - } - - err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); - return err; - -out_acl_release: - posix_acl_release(acl); - return err; -} - static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) @@ -1108,22 +1031,6 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler, return ovl_xattr_set(dentry, inode, name, value, size, flags); } -static const struct xattr_handler __maybe_unused -ovl_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - -static const struct xattr_handler __maybe_unused -ovl_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - static const struct xattr_handler ovl_own_trusted_xattr_handler = { .prefix = OVL_XATTR_TRUSTED_PREFIX, .get = ovl_own_xattr_get, @@ -1144,8 +1051,8 @@ static const struct xattr_handler ovl_other_xattr_handler = { static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, @@ -1154,8 +1061,8 @@ static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { static const struct xattr_handler *ovl_user_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, -- cgit From c39c07fce78439dfb0665b294753716297389179 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:25 +0200 Subject: cifs: use stub posix acl handlers Now that cifs supports the get and set acl inode operations and the vfs has been switched to the new posix api, cifs can simply rely on the stub posix acl handlers. The custom xattr handlers and associated unused helpers can be removed.
Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsproto.h | 8 -- fs/cifs/cifssmb.c | 298 ---------------------------------------------------- fs/cifs/xattr.c | 68 +----------- 3 files changed, 4 insertions(+), 370 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ce7f08a387b5..f50f96e4ec30 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -541,18 +541,10 @@ extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, struct cifs_ntsd *, __u32, int); -extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, struct posix_acl **acl, const int acl_type, const struct nls_table *nls_codepage, int remap); -extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *fileName, const struct posix_acl *acl, const int acl_type, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f119ca917947..23f10e0d6e7e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2914,304 +2914,6 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, #ifdef CONFIG_CIFS_POSIX -/*Convert an Access Control Entry from wire format to local POSIX xattr format*/ -static void cifs_convert_ace(struct posix_acl_xattr_entry *ace, - struct cifs_posix_ace *cifs_ace) -{ - /* u8 cifs fields do not need le conversion */ - ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); - ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); - ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ - - return; -} - -/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */ -static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, - const int acl_type, const int size_of_data_area) -{ - int size = 0; - int i; - __u16 count; - struct cifs_posix_ace *pACE; - struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; - struct posix_acl_xattr_header *local_acl = (void *)trgt; - - if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) - return -EOPNOTSUPP; - - if (acl_type == ACL_TYPE_ACCESS) { - count = le16_to_cpu(cifs_acl->access_entry_count); - pACE = &cifs_acl->ace_array[0]; - size = sizeof(struct cifs_posix_acl); - size += sizeof(struct cifs_posix_ace) * count; - /* check if we would go beyond end of SMB */ - if (size_of_data_area < size) { - cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. 
%d\n", - size_of_data_area, size); - return -EINVAL; - } - } else if (acl_type == ACL_TYPE_DEFAULT) { - count = le16_to_cpu(cifs_acl->access_entry_count); - size = sizeof(struct cifs_posix_acl); - size += sizeof(struct cifs_posix_ace) * count; -/* skip past access ACEs to get to default ACEs */ - pACE = &cifs_acl->ace_array[count]; - count = le16_to_cpu(cifs_acl->default_entry_count); - size += sizeof(struct cifs_posix_ace) * count; - /* check if we would go beyond end of SMB */ - if (size_of_data_area < size) - return -EINVAL; - } else { - /* illegal type */ - return -EINVAL; - } - - size = posix_acl_xattr_size(count); - if ((buflen == 0) || (local_acl == NULL)) { - /* used to query ACL EA size */ - } else if (size > buflen) { - return -ERANGE; - } else /* buffer big enough */ { - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); - - local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - for (i = 0; i < count ; i++) { - cifs_convert_ace(&ace[i], pACE); - pACE++; - } - } - return size; -} - -static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, - const struct posix_acl_xattr_entry *local_ace) -{ - cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm); - cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag); - /* BB is there a better way to handle the large uid? */ - if (local_ace->e_id == cpu_to_le32(-1)) { - /* Probably no need to le convert -1 on any arch but can not hurt */ - cifs_ace->cifs_uid = cpu_to_le64(-1); - } else - cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ -} - -/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */ -static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, - const int buflen, const int acl_type) -{ - __u16 rc = 0; - struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; - struct posix_acl_xattr_header *local_acl = (void *)pACL; - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); - int count; - int i; - - if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL)) - return 0; - - count = posix_acl_xattr_count((size_t)buflen); - cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n", - count, buflen, le32_to_cpu(local_acl->a_version)); - if (le32_to_cpu(local_acl->a_version) != 2) { - cifs_dbg(FYI, "unknown POSIX ACL version %d\n", - le32_to_cpu(local_acl->a_version)); - return 0; - } - cifs_acl->version = cpu_to_le16(1); - if (acl_type == ACL_TYPE_ACCESS) { - cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); - } else if (acl_type == ACL_TYPE_DEFAULT) { - cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); - } else { - cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); - return 0; - } - for (i = 0; i < count; i++) - convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); - if (rc == 0) { - rc = (__u16)(count * sizeof(struct cifs_posix_ace)); - rc += sizeof(struct cifs_posix_acl); - /* BB add check to make sure ACL does not overflow SMB */ - } - return rc; -} - -int -CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap) -{ -/* SMB_QUERY_POSIX_ACL */ - TRANSACTION2_QPI_REQ *pSMB = NULL; - TRANSACTION2_QPI_RSP *pSMBr = NULL; - int rc = 0; - int bytes_returned; - int name_len; - 
__u16 params, byte_count; - - cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName); - -queryAclRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - cifsConvertToUTF16((__le16 *) pSMB->FileName, - searchName, PATH_MAX, nls_codepage, - remap); - name_len++; /* trailing null */ - name_len *= 2; - pSMB->FileName[name_len] = 0; - pSMB->FileName[name_len+1] = 0; - } else { - name_len = copy_path_name(pSMB->FileName, searchName); - } - - params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; - pSMB->TotalDataCount = 0; - pSMB->MaxParameterCount = cpu_to_le16(2); - /* BB find exact max data count below from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le16(4000); - pSMB->MaxSetupCount = 0; - pSMB->Reserved = 0; - pSMB->Flags = 0; - pSMB->Timeout = 0; - pSMB->Reserved2 = 0; - pSMB->ParameterOffset = cpu_to_le16( - offsetof(struct smb_com_transaction2_qpi_req, - InformationLevel) - 4); - pSMB->DataCount = 0; - pSMB->DataOffset = 0; - pSMB->SetupCount = 1; - pSMB->Reserved3 = 0; - pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION); - byte_count = params + 1 /* pad */ ; - pSMB->TotalParameterCount = cpu_to_le16(params); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); - pSMB->Reserved4 = 0; - inc_rfc1001_len(pSMB, byte_count); - pSMB->ByteCount = cpu_to_le16(byte_count); - - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); - if (rc) { - cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc); - } else { - /* decode response */ - - rc = validate_t2((struct smb_t2_rsp *)pSMBr); - /* BB also check enough total bytes returned */ - if (rc || get_bcc(&pSMBr->hdr) < 2) - rc = -EIO; /* bad smb */ - else { - __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - __u16 count = le16_to_cpu(pSMBr->t2.DataCount); - rc = cifs_copy_posix_acl(acl_inf, - (char *)&pSMBr->hdr.Protocol+data_offset, - buflen, acl_type, count); - } - } - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto queryAclRetry; - return rc; -} - -int -CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, - const int acl_type, - const struct nls_table *nls_codepage, int remap) -{ - struct smb_com_transaction2_spi_req *pSMB = NULL; - struct smb_com_transaction2_spi_rsp *pSMBr = NULL; - char *parm_data; - int name_len; - int rc = 0; - int bytes_returned = 0; - __u16 params, byte_count, data_count, param_offset, offset; - - cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName); -setAclRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); - name_len++; /* trailing null */ - name_len *= 2; - } else { - name_len = copy_path_name(pSMB->FileName, fileName); - } - params = 6 + name_len; - pSMB->MaxParameterCount = cpu_to_le16(2); - /* BB find max SMB size from sess */ - pSMB->MaxDataCount = cpu_to_le16(1000); - pSMB->MaxSetupCount = 0; - pSMB->Reserved = 0; - pSMB->Flags = 0; - pSMB->Timeout = 0; - pSMB->Reserved2 = 0; - param_offset = offsetof(struct smb_com_transaction2_spi_req, - InformationLevel) - 4; - offset 
= param_offset + params; - parm_data = ((char *) &pSMB->hdr.Protocol) + offset; - pSMB->ParameterOffset = cpu_to_le16(param_offset); - - /* convert to on the wire format for POSIX ACL */ - data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type); - - if (data_count == 0) { - rc = -EOPNOTSUPP; - goto setACLerrorExit; - } - pSMB->DataOffset = cpu_to_le16(offset); - pSMB->SetupCount = 1; - pSMB->Reserved3 = 0; - pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION); - pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_ACL); - byte_count = 3 /* pad */ + params + data_count; - pSMB->DataCount = cpu_to_le16(data_count); - pSMB->TotalDataCount = pSMB->DataCount; - pSMB->ParameterCount = cpu_to_le16(params); - pSMB->TotalParameterCount = pSMB->ParameterCount; - pSMB->Reserved4 = 0; - inc_rfc1001_len(pSMB, byte_count); - pSMB->ByteCount = cpu_to_le16(byte_count); - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) - cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc); - -setACLerrorExit: - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto setAclRetry; - return rc; -} - #ifdef CONFIG_FS_POSIX_ACL /** * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 998fa51f9b68..5f2fb2fd2e37 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -200,32 +200,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } break; } - -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_ACCESS, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_DEFAULT, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } out: @@ -366,27 +340,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler, } break; } -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_ACCESS, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_DEFAULT, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } /* We could add an additional check for streams ie @@ -525,21 +478,6 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = { .set = cifs_xattr_set, }; - -static const struct xattr_handler cifs_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = XATTR_ACL_ACCESS, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - -static const struct xattr_handler cifs_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = XATTR_ACL_DEFAULT, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - const struct xattr_handler *cifs_xattr_handlers[] = { &cifs_user_xattr_handler, &cifs_os2_xattr_handler, @@ -549,7 
+487,9 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_cifs_ntsd_full_xattr_handler, &smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */ - &cifs_posix_acl_access_xattr_handler, - &cifs_posix_acl_default_xattr_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif NULL }; -- cgit From 39a6497a9bbb760c3c26a1bef7a0ab0b9fdefd9f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:26 +0200 Subject: 9p: use stub posix acl handlers Now that 9p supports the get and set acl inode operations and the vfs has been switched to the new posix acl api, 9p can simply rely on the stub posix acl handlers. The custom xattr handlers and associated unused helpers can be removed. Signed-off-by: Christian Brauner (Microsoft) --- fs/9p/acl.c | 121 ---------------------------------------------------------- fs/9p/xattr.c | 7 ++-- fs/9p/xattr.h | 2 - 3 files changed, 4 insertions(+), 126 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 135b26cee63a..c397c51f80d9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -343,124 +343,3 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, *modep = mode; return 0; } - -static int v9fs_xattr_get_acl(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - struct v9fs_session_info *v9ses; - struct posix_acl *acl; - int error; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * We allow set/get/list of acl when access=client is not specified - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_get(dentry, handler->name, buffer, size); - - acl = v9fs_get_cached_acl(inode, handler->flags); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - int retval; - struct posix_acl *acl; - struct v9fs_session_info *v9ses; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * set the attribute on the remote. Without even looking at the - * xattr value. We leave it to the server to validate - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_set(dentry, handler->name, value, size, - flags); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - if (value) { - /* update the cached acl value */ - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); - if (retval) - goto err_out; - } - } else - acl = NULL; - - switch (handler->flags) { - case ACL_TYPE_ACCESS: - if (acl) { - struct iattr iattr = { 0 }; - struct posix_acl *old_acl = acl; - - retval = posix_acl_update_mode(&init_user_ns, inode, - &iattr.ia_mode, &acl); - if (retval) - goto err_out; - if (!acl) { - /* - * ACL can be represented - * by the mode bits. So don't - * update ACL. - */ - posix_acl_release(old_acl); - value = NULL; - size = 0; - } - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ?
- * What is the following setxattr update the - * mode ? - */ - v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - retval = acl ? -EINVAL : 0; - goto err_out; - } - break; - default: - BUG(); - } - retval = v9fs_xattr_set(dentry, handler->name, value, size, flags); - if (!retval) - set_cached_acl(inode, handler->flags, acl); -err_out: - posix_acl_release(acl); - return retval; -} - -const struct xattr_handler v9fs_xattr_acl_access_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; - -const struct xattr_handler v9fs_xattr_acl_default_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 1f9298a4bd42..ae6a93871338 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -182,9 +183,9 @@ static struct xattr_handler v9fs_xattr_security_handler = { const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, -#ifdef CONFIG_9P_FS_POSIX_ACL - &v9fs_xattr_acl_access_handler, - &v9fs_xattr_acl_default_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_9P_FS_SECURITY &v9fs_xattr_security_handler, diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index 3e11fc3331eb..b5636e544c8a 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -11,8 +11,6 @@ #include extern const struct xattr_handler *v9fs_xattr_handlers[]; -extern const struct xattr_handler v9fs_xattr_acl_access_handler; -extern const struct xattr_handler v9fs_xattr_acl_default_handler; ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, void *buffer, size_t buffer_size); -- cgit From 0a26bde2c9db9817e2b4c0f890236f78d4d8ed7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Sep 2022 17:17:27 +0200 Subject: acl: remove a slew of now unused helpers Now that the posix acl api is active we can remove all the hacky helpers we had to keep around for all these years and also remove the set and get posix acl xattr handler methods as they aren't needed anymore. 
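For illustration only, a minimal sketch of the filesystem-side pattern that remains after this cleanup, assuming the usual linux/posix_acl_xattr.h helper; demo_get_acl() and demo_read_acl_blob() are hypothetical names, not part of the patch:

    static struct posix_acl *demo_get_acl(struct inode *inode, int type)
    {
    	struct posix_acl *acl;
    	void *value = NULL;
    	ssize_t size;

    	/* hypothetical helper: read the raw uapi-format ACL xattr blob */
    	size = demo_read_acl_blob(inode, type, &value);
    	if (size < 0)
    		return ERR_PTR(size);

    	/*
    	 * Map raw {g,u}ids through the filesystem's idmapping only;
    	 * idmapped mounts are applied later, at the VFS boundary.
    	 */
    	acl = posix_acl_from_xattr(i_user_ns(inode), value, size);
    	kfree(value);
    	return acl;
    }
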
Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 363 ++++----------------------------------------------------- fs/xattr.c | 5 +- 2 files changed, 23 insertions(+), 345 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 14b87653b797..acb7f7d7f11d 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -747,118 +747,32 @@ static int posix_acl_fix_xattr_common(const void *value, size_t size) return count; } -void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - struct user_namespace *fs_userns = i_user_ns(inode); - int count; - vfsuid_t vfsuid; - vfsgid_t vfsgid; - kuid_t uid; - kgid_t gid; - - if (no_idmapping(mnt_userns, i_user_ns(inode))) - return; - - count = posix_acl_fix_xattr_common(value, size); - if (count <= 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsuid = make_vfsuid(mnt_userns, fs_userns, uid); - entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, - vfsuid_into_kuid(vfsuid))); - break; - case ACL_GROUP: - gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsgid = make_vfsgid(mnt_userns, fs_userns, gid); - entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, - vfsgid_into_kgid(vfsgid))); - break; - default: - break; - } - } -} - -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - int count; - kuid_t uid; - kgid_t gid; - - count = posix_acl_fix_xattr_common(value, size); - if (count <= 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - /** - * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the - * provided callbacks to map ACL_{GROUP,USER} entries into the - * appropriate format - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping + * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format + * @userns: the filesystem's idmapping * @value: the uapi representation of POSIX ACLs * @size: the size of @void - * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries - * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries * - * The make_posix_acl() helper is an abstraction to translate from uapi format - * into the VFS format allowing the caller to specific callbacks to map - * ACL_{GROUP,USER} entries into the 
expected format. This is used in - * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code - * duplication. + * Filesystems that store POSIX ACLs in the unaltered uapi format should use + * posix_acl_from_xattr() when reading them from the backing store and + * converting them into the struct posix_acl VFS format. The helper is + * specifically intended to be called from the acl inode operation. + * + * The posix_acl_from_xattr() function will map the raw {g,u}id values stored + * in ACL_{GROUP,USER} entries into idmapping in @userns. + * + * Note that posix_acl_from_xattr() does not take idmapped mounts into account. + * If it did it calling it from the get acl inode operation would return POSIX + * ACLs mapped according to an idmapped mount which would mean that the value + * couldn't be cached for the filesystem. Idmapped mounts are taken into + * account on the fly during permission checking or right at the VFS - + * userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. */ -static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, const void *value, size_t size, - kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *, - const struct posix_acl_xattr_entry *), - kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *, - const struct posix_acl_xattr_entry *)) +struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, + const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; @@ -889,12 +803,14 @@ static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, break; case ACL_USER: - acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry); + acl_e->e_uid = make_kuid(userns, + le32_to_cpu(entry->e_id)); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: - acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry); + acl_e->e_gid = make_kgid(userns, + le32_to_cpu(entry->e_id)); if (!gid_valid(acl_e->e_gid)) goto fail; break; @@ -909,182 +825,6 @@ fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } - -/** - * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and - * filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @e: a ACL_USER entry in POSIX ACL uapi format - * - * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id - * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through - * KUIDT_INIT() and then map it according to the idmapped mount. The resulting - * kuid_t is the value which the filesystem can map up into a raw backing store - * id in the filesystem's idmapping. - * - * This is used in vfs_set_acl_prepare() to generate the proper VFS - * representation of POSIX ACLs with ACL_USER entries during setxattr(). - * - * Return: A kuid in @fs_userns for the uid stored in @e. 
- */ -static inline kuid_t -vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id)); - return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid)); -} - -/** - * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and - * filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @e: a ACL_GROUP entry in POSIX ACL uapi format - * - * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id - * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through - * KGIDT_INIT() and then map it according to the idmapped mount. The resulting - * kgid_t is the value which the filesystem can map up into a raw backing store - * id in the filesystem's idmapping. - * - * This is used in vfs_set_acl_prepare() to generate the proper VFS - * representation of POSIX ACLs with ACL_GROUP entries during setxattr(). - * - * Return: A kgid in @fs_userns for the gid stored in @e. - */ -static inline kgid_t -vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id)); - return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid)); -} - -/** - * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking - * mount and filesystem idmappings into account - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @value: the uapi representation of POSIX ACLs - * @size: the size of @void - * - * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be - * mapped according to the relevant mount- and filesystem idmapping. It is - * important that the ACL_{GROUP,USER} entries in struct posix_acl will be - * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem - * idmapping. This is crucial since the resulting struct posix_acl might be - * cached filesystem wide. The vfs_set_acl_prepare() function will take care to - * perform all necessary idmappings. - * - * Note, that since basically forever the {g,u}id values encoded as - * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain - * values that have been mapped according to the caller's idmapping. In other - * words, POSIX ACLs passed in uapi format as @value during setxattr() contain - * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have - * been stored as k{g,u}id_t. - * - * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by - * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t - * through from_vfs{g,u}id() to account for any idmapped mounts. The - * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the - * correct k{g,u}id_t. - * - * The filesystem will then receive the POSIX ACLs ready to be cached - * filesystem wide and ready to be written to the backing store taking the - * filesystem's idmapping into account. - * - * Return: Allocated struct posix_acl on success, NULL for a valid header but - * without actual POSIX ACL entries, or ERR_PTR() encoded error code. 
- */ -struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const void *value, size_t size) -{ - return make_posix_acl(mnt_userns, fs_userns, value, size, - vfs_set_acl_prepare_kuid, - vfs_set_acl_prepare_kgid); -} -EXPORT_SYMBOL(vfs_set_acl_prepare); - -/** - * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping - * @mnt_userns: unused - * @fs_userns: the filesystem's idmapping - * @e: a ACL_USER entry in POSIX ACL uapi format - * - * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping. - * This is used in posix_acl_from_xattr() to generate the proper VFS - * representation of POSIX ACLs with ACL_USER entries. - * - * Return: A kuid in @fs_userns for the uid stored in @e. - */ -static inline kuid_t -posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - return make_kuid(fs_userns, le32_to_cpu(e->e_id)); -} - -/** - * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping - * @mnt_userns: unused - * @fs_userns: the filesystem's idmapping - * @e: a ACL_GROUP entry in POSIX ACL uapi format - * - * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping. - * This is used in posix_acl_from_xattr() to generate the proper VFS - * representation of POSIX ACLs with ACL_GROUP entries. - * - * Return: A kgid in @fs_userns for the gid stored in @e. - */ -static inline kgid_t -posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - return make_kgid(fs_userns, le32_to_cpu(e->e_id)); -} - -/** - * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format - * @fs_userns: the filesystem's idmapping - * @value: the uapi representation of POSIX ACLs - * @size: the size of @void - * - * Filesystems that store POSIX ACLs in the unaltered uapi format should use - * posix_acl_from_xattr() when reading them from the backing store and - * converting them into the struct posix_acl VFS format. The helper is - * specifically intended to be called from the ->get_inode_acl() inode - * operation. - * - * The posix_acl_from_xattr() function will map the raw {g,u}id values stored - * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The - * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the - * correct k{g,u}id_t. The returned struct posix_acl can be cached. - * - * Note that posix_acl_from_xattr() does not take idmapped mounts into account. - * If it did it calling is from the ->get_inode_acl() inode operation would - * return POSIX ACLs mapped according to an idmapped mount which would mean - * that the value couldn't be cached for the filesystem. Idmapped mounts are - * taken into account on the fly during permission checking or right at the VFS - * - userspace boundary before reporting them to the user. - * - * Return: Allocated struct posix_acl on success, NULL for a valid header but - * without actual POSIX ACL entries, or ERR_PTR() encoded error code. 
- */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *fs_userns, - const void *value, size_t size) -{ - return make_posix_acl(&init_user_ns, fs_userns, value, size, - posix_acl_from_xattr_kuid, - posix_acl_from_xattr_kgid); -} EXPORT_SYMBOL (posix_acl_from_xattr); /* @@ -1190,31 +930,6 @@ ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, return real_size; } -static int -posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, void *value, size_t size) -{ - struct posix_acl *acl; - int error; - - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = get_inode_acl(inode, handler->flags); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return error; -} - int set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type, struct posix_acl *acl) @@ -1240,36 +955,6 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, } EXPORT_SYMBOL(set_posix_acl); -static int -posix_acl_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, - int flags) -{ - struct posix_acl *acl = NULL; - int ret; - - if (value) { - /* - * By the time we end up here the {g,u}ids stored in - * ACL_{GROUP,USER} have already been mapped according to the - * caller's idmapping. The vfs_set_acl_prepare() helper will - * recover them and take idmapped mounts into account. The - * filesystem will receive the POSIX ACLs in the correct - * format ready to be cached or written to the backing store - * taking the filesystem idmapping into account. - */ - acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode), - value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - ret = set_posix_acl(mnt_userns, dentry, handler->flags, acl); - posix_acl_release(acl); - return ret; -} - static bool posix_acl_xattr_list(struct dentry *dentry) { @@ -1280,8 +965,6 @@ const struct xattr_handler posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, - .get = posix_acl_xattr_get, - .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); @@ -1289,8 +972,6 @@ const struct xattr_handler posix_acl_default_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, - .get = posix_acl_xattr_get, - .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); diff --git a/fs/xattr.c b/fs/xattr.c index 77db5aa26d13..df3af9fa8c77 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -454,10 +454,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - error = __vfs_getxattr(dentry, inode, name, value, size); - if (error > 0 && is_posix_acl_xattr(name)) - posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); - return error; + return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); -- cgit From 6a542d1d5f6c814fd3643b43e85b21757c1e363b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 12:21:07 -0400 Subject: kill signal_pt_regs() Once upon a time it was used on hot paths, but that had not been true since 2013.
IOW, there's no point in an arch-optimized equivalent of task_pt_regs(current); the remaining two users are not worth bothering with. Signed-off-by: Al Viro --- fs/coredump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..b4ec1bf889f9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -525,7 +525,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = signal_pt_regs(), + .regs = task_pt_regs(current), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid -- cgit From 9a938eba8d284fba0daff62142dece74ae3c16de Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 12:25:39 -0400 Subject: kill coredump_params->regs it's always task_pt_regs(current) Signed-off-by: Al Viro --- fs/binfmt_elf.c | 4 ++-- fs/coredump.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 63c7ebb0da89..002fd713ac11 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2082,7 +2082,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); + elf_core_copy_regs(&info->prstatus->pr_reg, task_pt_regs(current)); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); @@ -2109,7 +2109,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* Try to dump the FPU. */ info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); + elf_core_copy_task_fpregs(current, task_pt_regs(current), info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); diff --git a/fs/coredump.c b/fs/coredump.c index b4ec1bf889f9..1a474de1e52b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -525,7 +525,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = task_pt_regs(current), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid -- cgit From 922ef161b21e605bf803f65dc928fabefb735702 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2022 00:39:23 -0400 Subject: [elf][regset] clean fill_note_info() a bit *info is already initialized... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 002fd713ac11..4190dafd2ec4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1833,24 +1833,17 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; - unsigned int i; - - info->size = 0; - info->thread = NULL; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); - if (psinfo == NULL) { - info->psinfo.data = NULL; /* So we don't free this wrongly */ + if (!psinfo) return 0; - } - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); /* * Figure out how many notes we're going to need for each thread.
*/ info->thread_notes = 0; - for (i = 0; i < view->n; ++i) + for (int i = 0; i < view->n; ++i) if (view->regsets[i].core_note_type != 0) ++info->thread_notes; -- cgit From 4b0e21d64253f56fa5b177e08383934680957697 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2020 13:44:16 -0400 Subject: [elf][regset] simplify thread list handling in fill_note_info() fill_note_info() iterates through the list of threads collected in mm->core_state->dumper, allocating a struct elf_thread_core_info instance for each and linking those into a list. We need the entry corresponding to current to be first in the resulting list, so the logic for list insertion is: if it's for current or the list is empty, insert at the head; else, insert after the first element. However, in mm->core_state->dumper the entry for current is guaranteed to be the first one. This means that both parts of the condition will be true on the first iteration and neither will be true on any subsequent one. Taking the first iteration out of the loop simplifies things nicely... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4190dafd2ec4..e990075fb43d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1866,7 +1866,14 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* * Allocate a structure for each thread. */ - for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) { + info->thread = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!info->thread)) + return 0; + + info->thread->task = dump_task; + for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { t = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_KERNEL); @@ -1874,17 +1881,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; t->task = ct->task; - if (ct->task == dump_task || !info->thread) { - t->next = info->thread; - info->thread = t; - } else { - /* - * Make sure to keep the original task at - * the head of the list. - */ - t->next = info->thread->next; - info->thread->next = t; - } + t->next = info->thread->next; + info->thread->next = t; } /* -- cgit From 3aca47127a646165965ff52803e2b269eed91afc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 Sep 2022 13:25:25 -0400 Subject: fs: drop useless condition from inode_needs_update_time Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index b608528efd3a..762feb8d0c6c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2071,9 +2071,6 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; - if (!sync_it) - return 0; - return sync_it; } -- cgit From 0dafb7e671f0e769c854609548037754a68dcbc6 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 12 Oct 2022 18:42:35 +0800 Subject: fs: udf: Optimize udf_free_in_core_inode and udf_find_fileset functions These two functions get two small cleanups: 1. Drop the cast of the foo pointer; a void * converts to any object pointer type without a cast. 2. Drop the redundant initialization of the bh variable, which is assigned before it is first read.
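A standalone C illustration of the first cleanup (hypothetical code, not from udf): a void * converts implicitly to any object pointer type, so the cast adds nothing.

    struct item { int x; };

    static void init_item(void *p)
    {
    	struct item *it = p;	/* implicit conversion, no cast needed */

    	it->x = 0;
    }
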
Signed-off-by: Li zeming Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221012104235.3331-1-zeming@nfschina.com --- fs/udf/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 4042d9739fb7..06eda8177b5f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -162,7 +162,7 @@ static void udf_free_in_core_inode(struct inode *inode) static void init_once(void *foo) { - struct udf_inode_info *ei = (struct udf_inode_info *)foo; + struct udf_inode_info *ei = foo; ei->i_data = NULL; inode_init_once(&ei->vfs_inode); @@ -820,7 +820,7 @@ static int udf_find_fileset(struct super_block *sb, struct kernel_lb_addr *fileset, struct kernel_lb_addr *root) { - struct buffer_head *bh = NULL; + struct buffer_head *bh; uint16_t ident; int ret; -- cgit From c791730f2554a9ebb8f18df9368dc27d4ebc38c2 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 23 Oct 2022 18:57:41 +0900 Subject: udf: Avoid double brelse() in udf_rename() syzbot reported a warning like below [1]: VFS: brelse: Trying to free free buffer WARNING: CPU: 2 PID: 7301 at fs/buffer.c:1145 __brelse+0x67/0xa0 ... Call Trace: invalidate_bh_lru+0x99/0x150 smp_call_function_many_cond+0xe2a/0x10c0 ? generic_remap_file_range_prep+0x50/0x50 ? __brelse+0xa0/0xa0 ? __mutex_lock+0x21c/0x12d0 ? smp_call_on_cpu+0x250/0x250 ? rcu_read_lock_sched_held+0xb/0x60 ? lock_release+0x587/0x810 ? __brelse+0xa0/0xa0 ? generic_remap_file_range_prep+0x50/0x50 on_each_cpu_cond_mask+0x3c/0x80 blkdev_flush_mapping+0x13a/0x2f0 blkdev_put_whole+0xd3/0xf0 blkdev_put+0x222/0x760 deactivate_locked_super+0x96/0x160 deactivate_super+0xda/0x100 cleanup_mnt+0x222/0x3d0 task_work_run+0x149/0x240 ? task_work_cancel+0x30/0x30 do_exit+0xb29/0x2a40 ? reacquire_held_locks+0x4a0/0x4a0 ? do_raw_spin_lock+0x12a/0x2b0 ? mm_update_next_owner+0x7c0/0x7c0 ? rwlock_bug.part.0+0x90/0x90 ? zap_other_threads+0x234/0x2d0 do_group_exit+0xd0/0x2a0 __x64_sys_exit_group+0x3a/0x50 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd The cause of the issue is that brelse() is called on both ofibh.sbh and ofibh.ebh by udf_find_entry() when it returns NULL. However, brelse() is called by udf_rename(), too. So, the b_count on the buffer_head becomes unbalanced. This patch fixes the issue by not calling brelse() from udf_rename() when udf_find_entry() returns NULL.
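The ownership rule being enforced generalizes beyond udf; a tiny hypothetical C sketch (not udf code): when the lookup helper returns NULL it has already dropped its references, so the caller must not drop them again.

    #include <stdio.h>

    struct buf { int refs; };

    static void put_buf(struct buf *b)
    {
    	if (b->refs <= 0) {
    		fprintf(stderr, "double put\n");	/* the reported imbalance */
    		return;
    	}
    	b->refs--;
    }

    /* Mirrors udf_find_entry()'s contract: on the NULL ("not found")
     * return it has already released the buffer itself. */
    static void *find_entry(struct buf *b)
    {
    	put_buf(b);
    	return NULL;
    }

    int main(void)
    {
    	struct buf b = { .refs = 1 };

    	if (!find_entry(&b))
    		return 1;	/* correct: no second put_buf(&b) here */

    	put_buf(&b);
    	return 0;
    }
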
Link: https://syzkaller.appspot.com/bug?id=8297f45698159c6bca8a1f87dc983667c1a1c851 [1] Reported-by: syzbot+7902cd7684bc35306224@syzkaller.appspotmail.com Signed-off-by: Shigeru Yoshida Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221023095741.271430-1-syoshida@redhat.com --- fs/udf/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index fb4c30e05245..d6081538bfc0 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1091,8 +1091,9 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, return -EINVAL; ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (IS_ERR(ofi)) { - retval = PTR_ERR(ofi); + if (!ofi || IS_ERR(ofi)) { + if (IS_ERR(ofi)) + retval = PTR_ERR(ofi); goto end_rename; } @@ -1101,8 +1102,7 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, brelse(ofibh.sbh); tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) - != old_inode->i_ino) + if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); -- cgit From c3db3c2fd9992c08f49aa93752d3c103c3a4f6aa Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 24 Oct 2022 19:30:12 +0200 Subject: f2fs: should put a page when checking the summary info The commit introduces another bug. Cc: stable@vger.kernel.org Fixes: c6ad7fd16657e ("f2fs: fix to do sanity check on summary info") Signed-off-by: Pavel Machek Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4546e01b2ee0..dab794225cce 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1110,6 +1110,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", ofs_in_node, dni->ino, dni->nid, max_addrs); + f2fs_put_page(node_page, 1); return false; } -- cgit From 14dc00a0e2dbea4b685ab9723ff511fcfd223c18 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2022 17:52:05 -0700 Subject: f2fs: let's avoid to get cp_rwsem twice by f2fs_evict_inode by d_invalidate f2fs_unlink -> f2fs_lock_op -> d_invalidate -> shrink_dentry_list -> iput_final -> f2fs_evict_inode -> f2fs_lock_op Reviewed-by: Chao Yu Tested-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a389772fd212..e104409c3a0e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -632,6 +632,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
Eventually we'll want avoid @@ -642,8 +644,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - f2fs_unlock_op(sbi); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: -- cgit From 2b5f9dad32ed19e8db3b0f10a84aa824a219803b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 20 Sep 2022 17:31:19 -0700 Subject: fs/exec: switch timens when a task gets a new mm Changing a time namespace requires remapping a vvar page, so we don't want to allow doing that if any other tasks can use the same mm. Currently, we install a time namespace when a task is created with a new vm. exec() is another case where a task gets a new mm and so can switch a time namespace safely, but it isn't handled now. One more issue of the current interface is that clone() with CLONE_VM isn't allowed if the current task has unshared a time namespace (timens_for_children doesn't match the current timens). Both of these issues are inconvenient for users. For example, Alexey and Florian reported that posix_spawn() uses vfork+exec and this pattern doesn't work with time namespaces due to both of the described issues. LXC needed to work around the exec() issue by calling setns. In the commit 133e2d3e81de5 ("fs/exec: allow to unshare a time namespace on vfork+exec"), we tried to fix these issues with minimal impact on UAPI, but it added extra complexity and some undesirable side effects. Eric suggested fixing the issues properly, since there is every reason to believe that no users depend on the old behavior. Cc: Alexey Izbyshev Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Florian Weimer Cc: Kees Cook Suggested-by: "Eric W. Biederman" Origin-author: "Eric W. Biederman" Signed-off-by: Andrei Vagin Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220921003120.209637-1-avagin@google.com --- fs/exec.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 32dc8cf5fceb..34e6a2e53268 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1297,6 +1298,10 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; + retval = exec_task_namespaces(); + if (retval) + goto out_unlock; + #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); -- cgit From 275498a98b1fe77deebddfc4f8986c0cf2c3ced7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 00:17:24 -0700 Subject: exec: Add comments on check_unsafe_exec() fs counting Add some comments about what the fs counting is doing in check_unsafe_exec() and how it relates to the call graph.
Specifically, we can't force an unshare of the fs because of at least Chrome: https://lore.kernel.org/lkml/86CE201B-5632-4BB7-BCF6-7CB2C2895409@chromium.org/ Cc: Eric Biederman Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20221018071537.never.662-kees@kernel.org --- fs/exec.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 34e6a2e53268..b3e87bd50962 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1573,6 +1573,12 @@ static void check_unsafe_exec(struct linux_binprm *bprm) if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + /* + * If another task is sharing our fs, we cannot safely + * suid exec because the differently privileged task + * will be able to manipulate the current directory, etc. + * It would be nice to force an unshare instead... + */ t = p; n_fs = 1; spin_lock(&p->fs->lock); @@ -1753,6 +1759,7 @@ static int search_binary_handler(struct linux_binprm *bprm) return retval; } +/* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; @@ -1811,6 +1818,11 @@ static int bprm_execve(struct linux_binprm *bprm, if (retval) return retval; + /* + * Check for unsafe execution states before exec_binprm(), which + * will call back into begin_new_exec(), into bprm_creds_from_file(), + * where setuid-ness is evaluated. + */ check_unsafe_exec(bprm); current->in_execve = 1; -- cgit From 8f6e3f9e5a0f58e458a348b7e36af11d0e9702af Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 00:14:20 -0700 Subject: binfmt: Fix whitespace issues Fix the annoying whitespace issues that have been following these files around for years. Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20221018071350.never.230-kees@kernel.org --- fs/binfmt_elf.c | 14 +++++++------- fs/binfmt_elf_fdpic.c | 2 +- fs/exec.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6a11025e5850..38a9496f83f3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, } while (0) #ifdef ARCH_DLINFO - /* + /* * ARCH_DLINFO must come first so PPC can do its special alignment of * AUXV. * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in @@ -1020,7 +1020,7 @@ out_free_interp: executable_stack); if (retval < 0) goto out_free_dentry; - + elf_bss = 0; elf_brk = 0; @@ -1043,7 +1043,7 @@ out_free_interp: if (unlikely (elf_brk > elf_bss)) { unsigned long nbyte; - + /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. 
*/ @@ -1521,7 +1521,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_align = 0; } -static void fill_note(struct memelfnote *note, const char *name, int type, +static void fill_note(struct memelfnote *note, const char *name, int type, unsigned int sz, void *data) { note->name = name; @@ -2004,8 +2004,8 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) t->num_notes = 0; fill_prstatus(&t->prstatus.common, p, signr); - elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); t->num_notes++; @@ -2295,7 +2295,7 @@ static int elf_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!write_note_info(&info, cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d0c8797828..e90c1192dec6 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1603,7 +1603,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!writenote(thread_list->notes, cprm)) goto end_coredump; if (!writenote(&psinfo_note, cprm)) diff --git a/fs/exec.c b/fs/exec.c index b3e87bd50962..1644bf10fb9d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -172,7 +172,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) exit: fput(file); out: - return error; + return error; } #endif /* #ifdef CONFIG_USELIB */ -- cgit From bfb4a2b95875a47a01234f2de113ec089d524e71 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 19 Oct 2022 09:32:35 +0200 Subject: exec: simplify initial stack size expansion I had a hard time trying to understand completely why it is using vm_end in one side of the expression and vm_start in the other one, and using something in the "if" clause that is not an exact copy of what is used below. The whole point is that the stack_size variable that was used in the "if" clause is the difference between vm_start and vm_end, which is not far away but makes this thing harder to read than it must be. Signed-off-by: Rolf Eike Beer Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/2017429.gqNitNVd0C@mobilepool36.emlix.com --- fs/exec.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1644bf10fb9d..9585bc1bc970 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -843,16 +843,13 @@ int setup_arg_pages(struct linux_binprm *bprm, * will align it up. 
*/ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; + + stack_expand = min(rlim_stack, stack_size + stack_expand); + #ifdef CONFIG_STACK_GROWSUP - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_start + rlim_stack; - else - stack_base = vma->vm_end + stack_expand; + stack_base = vma->vm_start + stack_expand; #else - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_end - rlim_stack; - else - stack_base = vma->vm_start - stack_expand; + stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); -- cgit From cfc46ca4fdcaef6f0215f56d89162400aee402e3 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 19 Oct 2022 09:43:01 +0200 Subject: binfmt_elf: fix documented return value for load_elf_phdrs() This function has never returned anything but a plain NULL. Fixes: 6a8d38945cf4 ("binfmt_elf: Hoist ELF program header loading to a function") Signed-off-by: Rolf Eike Beer Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/2359389.EDbqzprbEW@mobilepool36.emlix.com --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 38a9496f83f3..71df1a3c2dac 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -456,7 +456,7 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) * * Loads ELF program headers from the binary file elf_file, which has the ELF * header pointed to by elf_ex, into a newly allocated array. The caller is - * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. + * responsible for freeing the allocated data. Returns NULL upon failure. */ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) -- cgit From ef20c5139c3157b5c6eda46f496952bddffe9ad6 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Wed, 19 Oct 2022 09:52:16 +0200 Subject: binfmt_elf: simplify error handling in load_elf_phdrs() The err variable was the same as retval, just capped to <= 0. The cap makes no difference, since elf_read() never returns positive values. Signed-off-by: Rolf Eike Beer Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/4137126.7Qn9TF0dmF@mobilepool36.emlix.com --- fs/binfmt_elf.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 71df1a3c2dac..528e2ac8931f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -462,7 +462,7 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; - int retval, err = -1; + int retval = -1; unsigned int size; /* @@ -484,15 +484,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, /* Read in the program headers */ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff); - if (retval < 0) { - err = retval; - goto out; - } - /* Success! */ - err = 0; out: - if (err) { + if (retval) { kfree(elf_phdata); elf_phdata = NULL; } -- cgit From a2bd096fb2d7f50fb4db246b33e7bfcf5e2eda3a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Jun 2022 22:12:16 +0200 Subject: fs: use type safe idmapping helpers We already ported most parts and filesystems over for v6.0 to the new vfs{g,u}id_t type and associated helpers. Convert the remaining places so we can remove all the old helpers. This is a non-functional change.
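The conversion applied across these files follows one pattern; a minimal sketch, assuming the usual linux/mnt_idmapping.h helpers, with demo_is_owner_*() as made-up names (the helpers themselves are the ones visible in the diffs below):

    /* Before: compare raw kuid_t values obtained from the mount idmapping. */
    static bool demo_is_owner_old(struct user_namespace *mnt_userns,
    			      const struct inode *inode)
    {
    	return uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode));
    }

    /* After: the mapped id lives in the distinct vfsuid_t type, and any
     * comparison against a kuid_t goes through an explicit helper. */
    static bool demo_is_owner_new(struct user_namespace *mnt_userns,
    			      const struct inode *inode)
    {
    	vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);

    	return vfsuid_eq_kuid(vfsuid, current_fsuid());
    }
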
Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/coredump.c | 4 ++-- fs/exec.c | 16 ++++++++-------- fs/inode.c | 8 ++++---- fs/namei.c | 40 ++++++++++++++++++++-------------------- fs/remap_range.c | 2 +- fs/stat.c | 7 +++++-- 6 files changed, 40 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..a133103eb721 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -716,8 +716,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) * filesystem. */ mnt_userns = file_mnt_user_ns(cprm.file); - if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), - current_fsuid())) { + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + current_fsuid())) { pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); goto close_fail; diff --git a/fs/exec.c b/fs/exec.c index 349a5da91efe..dd91adec7a11 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1591,8 +1591,8 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) struct user_namespace *mnt_userns; struct inode *inode = file_inode(file); unsigned int mode; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1611,23 +1611,23 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - uid = i_uid_into_mnt(mnt_userns, inode); - gid = i_gid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ - if (!kuid_has_mapping(bprm->cred->user_ns, uid) || - !kgid_has_mapping(bprm->cred->user_ns, gid)) + if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || + !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->euid = uid; + bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->egid = gid; + bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } diff --git a/fs/inode.c b/fs/inode.c index 8c4078889754..757cac29bd5a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2326,15 +2326,15 @@ EXPORT_SYMBOL(inode_init_owner); bool inode_owner_or_capable(struct user_namespace *mnt_userns, const struct inode *inode) { - kuid_t i_uid; + vfsuid_t vfsuid; struct user_namespace *ns; - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid)) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); - if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER)) + if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..d5c5cb7dd023 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -336,11 +336,11 @@ static int acl_permission_check(struct user_namespace *mnt_userns, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - kuid_t i_uid; + vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (likely(uid_eq(current_fsuid(), i_uid))) { + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? 
-EACCES : 0; @@ -362,8 +362,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(kgid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -581,7 +581,7 @@ struct nameidata { struct nameidata *saved; unsigned root_seq; int dfd; - kuid_t dir_uid; + vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; @@ -1095,15 +1095,15 @@ fs_initcall(init_fs_namei_sysctls); static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct user_namespace *mnt_userns; - kuid_t i_uid; + vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; mnt_userns = mnt_user_ns(nd->path.mnt); - i_uid = i_uid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); /* Allowed if owner and follower match. */ - if (uid_eq(current_cred()->fsuid, i_uid)) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -1111,7 +1111,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory and link owner match. */ - if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid)) + if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1183,8 +1183,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1232,13 +1232,13 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; - kuid_t dir_uid = nd->dir_uid; + vfsuid_t dir_vfsuid = nd->dir_vfsuid; if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) || - uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) || + vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || @@ -2307,7 +2307,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode); + nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2885,9 +2885,9 @@ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, { kuid_t fsuid = current_fsuid(); - if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid)) return 0; - if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); } @@ -2926,8 +2926,8 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, 
BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); diff --git a/fs/remap_range.c b/fs/remap_range.c index 654912d06862..290743c8d226 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -429,7 +429,7 @@ static bool allow_file_dedupe(struct file *file) return true; if (file->f_mode & FMODE_WRITE) return true; - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return true; if (!inode_permission(mnt_userns, inode, MAY_WRITE)) return true; diff --git a/fs/stat.c b/fs/stat.c index ef50573c72a2..d6cc74ca8486 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -44,12 +44,15 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, struct kstat *stat) { + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; -- cgit From a03a972b26da2d30d1f3b3a72963191cd2938835 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:40:21 +0200 Subject: fuse: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. Cc: Christoph Hellwig Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/fuse/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..84c1ca4bc1dc 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -98,7 +98,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; -- cgit From c12db92d62bf8cd4532ab4e572be0926d3a375be Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:07:47 +0200 Subject: ovl: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. 
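For readers following this series, a minimal sketch of the recurring before/after pattern (illustrative only; the example_is_owner_*() functions are made up, but the helpers are exactly the ones used in the hunks above and below):

  /* old style: raw kuid_t, easy to conflate with other id types */
  static bool example_is_owner_old(struct user_namespace *mnt_userns,
                                   const struct inode *inode)
  {
          return uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode));
  }

  /* new style: the dedicated vfsuid_t type makes such conflation a
   * compile-time error instead of a latent bug */
  static bool example_is_owner_new(struct user_namespace *mnt_userns,
                                   const struct inode *inode)
  {
          vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);

          return vfsuid_eq_kuid(vfsuid, current_fsuid());
  }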
Cc: Amir Goldstein Cc: Christoph Hellwig Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/util.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 81a57a8d80d9..c0c20d33691b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1104,13 +1104,18 @@ void ovl_copyattr(struct inode *inode) struct path realpath; struct inode *realinode; struct user_namespace *real_mnt_userns; + vfsuid_t vfsuid; + vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); real_mnt_userns = mnt_user_ns(realpath.mnt); - inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); - inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); + vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + + inode->i_uid = vfsuid_into_kuid(vfsuid); + inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; -- cgit From ebe060369f8d6e4588b115f252bebf5ba4d64350 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 22 Oct 2022 21:39:14 +0100 Subject: jfs: Fix fortify moan in symlink JFS has in jfs_incore.h:

  /* _inline may overflow into _inline_ea when needed */
  /* _inline_ea may overlay the last part of
   * file._xtroot if maxentry = XTROOTINITSLOT
   */
  union {
          struct {
                  /* 128: inline symlink */
                  unchar _inline[128];
                  /* 128: inline extended attr */
                  unchar _inline_ea[128];
          };
          unchar _inline_all[256];

and currently the symlink code copies into _inline; if the copy is larger than 128 bytes it triggers a fortify warning of the form:

  memcpy: detected field-spanning write (size 132) of single field "ip->i_link" at fs/jfs/namei.c:950 (size 18446744073709551615)

even though the write is actually fine. Copy it into _inline_all instead. Reported-by: syzbot+5fc38b2ddbbca7f5c680@syzkaller.appspotmail.com Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kees Cook Signed-off-by: Dave Kleikamp --- fs/jfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9db4f5789c0e..4fbbf88435e6 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -946,7 +946,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, if (ssize <= IDATASIZE) { ip->i_op = &jfs_fast_symlink_inode_operations; - ip->i_link = JFS_IP(ip)->i_inline; + ip->i_link = JFS_IP(ip)->i_inline_all; memcpy(ip->i_link, name, ssize); ip->i_size = ssize - 1; -- cgit From 25e70c6162f207828dd405b432d8f2a98dbf7082 Mon Sep 17 00:00:00 2001 From: Hoi Pok Wu Date: Tue, 25 Oct 2022 23:20:45 +0800 Subject: fs: jfs: fix shift-out-of-bounds in dbDiscardAG This should address most of the UBSAN bugs syzbot found recently in this area by guarding dbMount(), since syzbot feeds rubbish into the bmap descriptor.
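To see the intent of the guard, here is a hedged sketch (validate_bmap() is hypothetical; MAXAG and the db_* fields are the ones used in the hunk below):

  /* Reject a corrupted on-disk bmap descriptor at mount time, so that
   * fields like db_agl2size can be trusted in later shift expressions
   * such as the ones in dbDiscardAG(). */
  static int validate_bmap(u64 db_mapsize, int db_agl2size)
  {
          /* the derived allocation-group count must fit the
           * fixed-size per-AG arrays */
          if (((db_mapsize - 1) >> db_agl2size) > MAXAG)
                  return -EINVAL;
          return 0;
  }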
Signed-off-by: Hoi Pok Wu Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index e1cbfbb60303..765838578a72 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -198,6 +198,11 @@ int dbMount(struct inode *ipbmap) goto err_release_metapage; } + if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + for (i = 0; i < MAXAG; i++) bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); -- cgit From 28fc4e9077ce59ab28c89c20dc6be5154473218f Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 18 Oct 2022 10:45:32 +0800 Subject: f2fs: Fix the race condition of resize flag between resizefs Because the SBI_IS_RESIZEFS flag is set and cleared outside of any lock, the following race is possible:

  thread1                         thread2
  ->ioctl(resizefs)
  ->set RESIZEFS flag
                                  ->ioctl(resizefs)
  ...
                                  ->set RESIZEFS flag
  ->clear RESIZEFS flag
                                  ->resizefs stream
                                  # No RESIZEFS flag in the stream

Also, before freeze_super() the resize has not actually started, so we should not set the SBI_IS_RESIZEFS flag that early. Move the set/clear of SBI_IS_RESIZEFS inside the cp_mutex and gc_lock. Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Zhang Xiaoxu Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index dab794225cce..7b4be412cec0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2134,8 +2134,6 @@ out_unlock: if (err) return err; - set_sbi_flag(sbi, SBI_IS_RESIZEFS); - freeze_super(sbi->sb); f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); @@ -2151,6 +2149,7 @@ out_unlock: if (err) goto out_err; + set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; @@ -2174,6 +2173,7 @@ out_unlock: f2fs_commit_super(sbi, false); } recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); @@ -2186,6 +2186,5 @@ out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); - clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; } -- cgit From a351b1f444187312bb42479cb26e82f26fc481d2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Oct 2022 10:00:26 +0200 Subject: acl: make vfs_posix_acl_to_xattr() static After reworking posix acls this helper isn't used anywhere outside the core posix acl paths. Make it static. Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index acb7f7d7f11d..989bbf280bfe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -884,9 +884,10 @@ EXPORT_SYMBOL (posix_acl_to_xattr); * Return: On success, the size of the stored uapi posix acls, on error a * negative errno.
*/ -ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, - struct inode *inode, const struct posix_acl *acl, - void *buffer, size_t size) +static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, + struct inode *inode, + const struct posix_acl *acl, void *buffer, + size_t size) { struct posix_acl_xattr_header *ext_acl = buffer; -- cgit From cb2144d66b0b24fd1b880fc72678ba21ca414dab Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Oct 2022 12:45:10 +0200 Subject: cifs: check whether acl is valid early Dan reported that acl is dereferenced before being checked and this is a valid problem. Fix it by erroring out early instead of doing it later after we've already relied on acl to be a valid pointer. Fixes: dc1af4c4b472 ("cifs: implement set acl method") Reported-by: Dan Carpenter Signed-off-by: Christian Brauner (Microsoft) --- fs/cifs/cifsacl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6a9f03c882dc..c647f0d56518 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1764,6 +1764,10 @@ int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, rc = PTR_ERR(full_path); goto out; } + + if (!acl) + goto out; + /* return dos attributes as pseudo xattr */ /* return alt name if available as pseudo attr */ @@ -1778,8 +1782,6 @@ int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, switch (type) { case ACL_TYPE_ACCESS: - if (!acl) - goto out; if (sb->s_flags & SB_POSIXACL) rc = cifs_do_set_acl(xid, pTcon, full_path, acl, ACL_TYPE_ACCESS, @@ -1788,8 +1790,6 @@ int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, break; case ACL_TYPE_DEFAULT: - if (!acl) - goto out; if (sb->s_flags & SB_POSIXACL) rc = cifs_do_set_acl(xid, pTcon, full_path, acl, ACL_TYPE_DEFAULT, -- cgit From 256c8aed2b420a7c57ed6469fbb0f8310f5aeec9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 26 Oct 2022 12:51:27 +0200 Subject: fs: introduce dedicated idmap type for mounts Last cycle we've already made the interaction with idmapped mounts more robust and type safe by introducing the vfs{g,u}id_t type. This cycle we concluded the conversion and removed the legacy helpers. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate filesystem and mount namespaces and the different roles they have to play. Especially for filesystem developers without much experience in this area this is an easy source of bugs. Instead of passing the plain namespace we introduce a dedicated type struct mnt_idmap and replace the pointer with a pointer to a struct mnt_idmap. There are no semantic or size changes for the mount struct caused by this. We then start converting all places aware of idmapped mounts to rely on struct mnt_idmap. Once the conversion is done all helpers down to the really low-level make_vfs{g,u}id() and from_vfs{g,u}id() will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two, removing and thus eliminating the possibility of any bugs. Fwiw, I fixed some issues in that area in ntfs3 and ksmbd in the past. Afterwards, only low-level code can ultimately use the associated namespace for any permission checks. Even most of the vfs can be ultimately completely oblivious about this and filesystems will never interact with it directly in any form in the future.
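To make the type-safety argument concrete, a small illustration (the check_perm_*() prototypes are hypothetical, not from the patch):

  /* before: the filesystem's namespace and the mount's idmapping share
   * one type, so callers can swap them without any diagnostic */
  int check_perm_old(struct user_namespace *mnt_userns,
                     struct inode *inode, int mask);

  /* after: a dedicated type; accidentally passing sb->s_user_ns (a plain
   * struct user_namespace *) where an idmapping is expected no longer
   * compiles */
  int check_perm_new(struct mnt_idmap *idmap,
                     struct inode *inode, int mask);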
A struct mnt_idmap currently encompasses a simple refcount and a pointer to the relevant namespace the mount is idmapped to. If a mount isn't idmapped then it will point to a static nop_mnt_idmap. If it is an idmapped mount it will point to a new struct mnt_idmap. As usual there are no allocations or anything happening for non-idmapped mounts. Everything is carefully written to be a nop for non-idmapped mounts as has always been the case. If an idmapped mount or mount tree is created a new struct mnt_idmap is allocated and a reference taken on the relevant namespace. For each mount in a mount tree that gets idmapped or a mount that inherits the idmap when it is cloned the reference count on the associated struct mnt_idmap is bumped. Just a reminder that we only allow a mount to change its idmapping a single time and only if it hasn't already been attached to the filesystem and has no active writers. The actual changes are fairly straightforward. This will have huge benefits for maintenance and security in the long run even if it causes some churn. I'm aware that there's some cost for all of you, and I'll commit to doing this work and making this as painless as I can. Note that this also makes it possible to extend struct mnt_idmap in the future. For example, it would be possible to place the namespace pointer in an anonymous union together with an idmapping struct. This would allow us to expose an api to userspace that would let it specify idmappings directly instead of having to go through the detour of setting up namespaces at all. This just adds the infrastructure and doesn't do any conversions. Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/namespace.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 142 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index df137ba19d37..e276c0e7ecea 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -82,6 +98,7 @@ struct mount_kattr { unsigned int lookup_flags; bool recurse; struct user_namespace *mnt_userns; + struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ @@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt) #endif } +/** + * mnt_idmap_owner - retrieve owner of the mount's idmapping + * @idmap: mount idmapping + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapping.
+ */ +struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) +{ + return idmap->owner; +} +EXPORT_SYMBOL_GPL(mnt_idmap_owner); + +/** + * mnt_user_ns - retrieve owner of an idmapped mount + * @mnt: the relevant vfsmount + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapped. + */ +struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) +{ + struct mnt_idmap *idmap = mnt_idmap(mnt); + + /* Return the actual owner of the filesystem instead of the nop. */ + if (idmap == &nop_mnt_idmap && + !initial_idmapping(mnt->mnt_sb->s_user_ns)) + return mnt->mnt_sb->s_user_ns; + return mnt_idmap_owner(idmap); +} +EXPORT_SYMBOL_GPL(mnt_user_ns); + +/** + * alloc_mnt_idmap - allocate a new idmapping for the mount + * @mnt_userns: owning userns of the idmapping + * + * Allocate a new struct mnt_idmap which carries the idmapping of the mount. + * + * Return: On success a new idmap, on error an error pointer is returned. + */ +static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. + */ +static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user. 
+ */ +static inline void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} + static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); - mnt->mnt.mnt_userns = &init_user_ns; + mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(&mnt->mnt); - if (!initial_idmapping(mnt_userns)) - put_user_ns(mnt_userns); + mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - fs_userns = mnt->mnt.mnt_sb->s_user_ns; - if (!initial_idmapping(fs_userns)) - mnt->mnt.mnt_userns = get_user_ns(fs_userns); - lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); - mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); - if (!initial_idmapping(mnt->mnt.mnt_userns)) - mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); + mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); + mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -3981,14 +4086,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (kattr->mnt_userns == fs_userns) + if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) return -EINVAL; /* @@ -4028,7 +4133,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && - !kattr->mnt_userns; + !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) @@ -4082,27 +4187,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { - struct user_namespace *mnt_userns, *old_mnt_userns; - - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return; /* - * We're the only ones able to change the mount's idmapping. So - * mnt->mnt.mnt_userns is stable and we can retrieve it directly. - */ - old_mnt_userns = mnt->mnt.mnt_userns; - - mnt_userns = get_user_ns(kattr->mnt_userns); - /* Pairs with smp_load_acquire() in mnt_user_ns(). 
*/ - smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); - - /* - * If this is an idmapped filesystem drop the reference we've taken - * in vfs_create_mount() before. + * Pairs with smp_load_acquire() in mnt_idmap(). + * + * Since we only allow a mount to change the idmapping once and + * verified this in can_idmap_mount() we know that the mount has + * @nop_mnt_idmap attached to it. So there's no need to drop any + * references. */ - if (!initial_idmapping(old_mnt_userns)) - put_user_ns(old_mnt_userns); + smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4136,6 +4232,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; + if (kattr->mnt_userns) { + struct mnt_idmap *mnt_idmap; + + mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); + if (IS_ERR(mnt_idmap)) + return PTR_ERR(mnt_idmap); + kattr->mnt_idmap = mnt_idmap; + } + if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing @@ -4323,6 +4428,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; + + if (kattr->mnt_idmap) + mnt_idmap_put(kattr->mnt_idmap); } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, -- cgit From 5a6f52d20ce3cd6d30103a27f18edff337da191b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 28 Oct 2022 09:56:20 +0200 Subject: acl: conver higher-level helpers to rely on mnt_idmap Convert an initial portion to rely on struct mnt_idmap by converting the high level xattr helpers. Reviewed-by: Seth Forshee (DigitalOcean) Signed-off-by: Christian Brauner (Microsoft) --- fs/internal.h | 12 ++++++------ fs/posix_acl.c | 16 +++++++++------- fs/xattr.c | 39 ++++++++++++++++++++------------------- 3 files changed, 35 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 0c8812fe7ca4..a803cc3cf716 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -227,28 +227,28 @@ struct xattr_ctx { }; -ssize_t do_getxattr(struct user_namespace *mnt_userns, +ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL -int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); -ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else -static inline int do_set_acl(struct user_namespace *mnt_userns, +static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } -static inline ssize_t do_get_acl(struct user_namespace *mnt_userns, +static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 989bbf280bfe..7260b59b26a0 100644 --- a/fs/posix_acl.c +++ 
b/fs/posix_acl.c @@ -871,7 +871,7 @@ EXPORT_SYMBOL (posix_acl_to_xattr); /** * vfs_posix_acl_to_xattr - convert from kernel to userspace representation - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @inode: inode the posix acls are set on * @acl: the posix acls as represented by the vfs * @buffer: the buffer into which to convert @acl @@ -884,7 +884,7 @@ EXPORT_SYMBOL (posix_acl_to_xattr); * Return: On success, the size of the stored uapi posix acls, on error a * negative errno. */ -static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, +static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, void *buffer, size_t size) @@ -893,6 +893,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; + struct user_namespace *mnt_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -908,6 +909,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct user_namespace *mnt_userns, fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); + mnt_userns = mnt_idmap_owner(idmap); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); @@ -1227,7 +1229,7 @@ out_inode_unlock: } EXPORT_SYMBOL_GPL(vfs_remove_acl); -int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { int error; @@ -1243,22 +1245,22 @@ int do_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return PTR_ERR(acl); } - error = vfs_set_acl(mnt_userns, dentry, acl_name, acl); + error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl); posix_acl_release(acl); return error; } -ssize_t do_get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { ssize_t error; struct posix_acl *acl; - acl = vfs_get_acl(mnt_userns, dentry, acl_name); + acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); - error = vfs_posix_acl_to_xattr(mnt_userns, d_inode(dentry), + error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), acl, kvalue, size); posix_acl_release(acl); return error; diff --git a/fs/xattr.c b/fs/xattr.c index df3af9fa8c77..3641a0ce5380 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -597,19 +597,19 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) - return do_set_acl(mnt_userns, dentry, ctx->kname->name, + return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); - return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, + return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } static long -setxattr(struct user_namespace *mnt_userns, struct dentry *d, +setxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { @@ -627,7 +627,7 @@ setxattr(struct user_namespace *mnt_userns, struct dentry 
*d, if (error) return error; - error = do_setxattr(mnt_userns, d, &ctx); + error = do_setxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -646,7 +646,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(mnt_user_ns(path.mnt), path.dentry, name, + error = setxattr(mnt_idmap(path.mnt), path.dentry, name, value, size, flags); mnt_drop_write(path.mnt); } @@ -683,7 +683,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(file_mnt_user_ns(f.file), + error = setxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); @@ -696,7 +696,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, * Extended attribute GET operations */ ssize_t -do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, +do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx) { ssize_t error; @@ -711,9 +711,10 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, } if (is_posix_acl_xattr(ctx->kname->name)) - error = do_get_acl(mnt_userns, d, kname, ctx->kvalue, ctx->size); + error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); else - error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); + error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname, + ctx->kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; @@ -727,7 +728,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, } static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, +getxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, void __user *value, size_t size) { ssize_t error; @@ -746,7 +747,7 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; - error = do_getxattr(mnt_userns, d, &ctx); + error = do_getxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -762,7 +763,7 @@ retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; - error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); + error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -792,7 +793,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, + error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size); fdput(f); return error; @@ -877,7 +878,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) * Extended attribute REMOVE operations */ static long -removexattr(struct user_namespace *mnt_userns, struct dentry *d, +removexattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name) { int error; @@ -890,9 +891,9 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d, return error; if (is_posix_acl_xattr(kname)) - return vfs_remove_acl(mnt_userns, d, kname); + return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname); - return vfs_removexattr(mnt_userns, d, kname); + return vfs_removexattr(mnt_idmap_owner(idmap), d, kname); } static int path_removexattr(const char __user *pathname, @@ -906,7 +907,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = 
removexattr(mnt_user_ns(path.mnt), path.dentry, name); + error = removexattr(mnt_idmap(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); @@ -939,7 +940,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = removexattr(file_mnt_user_ns(f.file), + error = removexattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } -- cgit From 7ee47dcfff1835ff75a794d1075b6b5f5462cfed Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 31 Oct 2022 18:52:56 +0100 Subject: fs: use acquire ordering in __fget_light() We must prevent the CPU from reordering the files->count read with the FD table access like this, on architectures where read-read reordering is possible: files_lookup_fd_raw() close_fd() put_files_struct() atomic_read(&files->count) I would like to mark this for stable, but the stable rules explicitly say "no theoretical races", and given that the FD table pointer and files->count are explicitly stored in the same cacheline, this sort of reordering seems quite unlikely in practice... Signed-off-by: Jann Horn Signed-off-by: Al Viro --- fs/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 5f9c802a5d8d..c942c89ca4cd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1003,7 +1003,16 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) struct files_struct *files = current->files; struct file *file; - if (atomic_read(&files->count) == 1) { + /* + * If another thread is concurrently calling close_fd() followed + * by put_files_struct(), we must not observe the old table + * entry combined with the new refcount - otherwise we could + * return a file that is concurrently being freed. + * + * atomic_read_acquire() pairs with atomic_dec_and_test() in + * put_files_struct(). + */ + if (atomic_read_acquire(&files->count) == 1) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; -- cgit From 5a17f040fa332e71a45ca9ff02d6979d9176a423 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 26 Oct 2022 16:31:11 -0700 Subject: cred: Do not default to init_cred in prepare_kernel_cred() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common exploit pattern for ROP attacks is to abuse prepare_kernel_cred() in order to construct escalated privileges[1]. Instead of providing a short-hand argument (NULL) to the "daemon" argument to indicate using init_cred as the base cred, require that "daemon" is always set to an actual task. Replace all existing callers that were passing NULL with &init_task. Future attacks will need to have sufficiently powerful read/write primitives to have found an appropriately privileged task and written it to the ROP stack as an argument to succeed, which is similarly difficult to the prior effort needed to escalate privileges before struct cred existed: locate the current cred and overwrite the uid member. This has the added benefit of meaning that prepare_kernel_cred() can no longer exceed the privileges of the init task, which may have changed from the original init_cred (e.g. dropping capabilities from the bounding set). [1] https://google.com/search?q=commit_creds(prepare_kernel_cred(0)) Cc: "Eric W. Biederman" Cc: David Howells Cc: "Rafael J. 
Wysocki" Cc: Steve French Cc: Ronnie Sahlberg Cc: Shyam Prasad N Cc: Tom Talpey Cc: Namjae Jeon Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Cc: Jeff Layton Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: "Michal Koutný" Cc: Peter Zijlstra Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Luis Chamberlain Reviewed-by: Sergey Senozhatsky Acked-by: Russ Weight Acked-by: Greg Kroah-Hartman Acked-by: Paulo Alcantara (SUSE) Link: https://lore.kernel.org/r/20221026232943.never.775-kees@kernel.org --- fs/cifs/cifs_spnego.c | 2 +- fs/cifs/cifsacl.c | 2 +- fs/ksmbd/smb_common.c | 2 +- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++-- fs/nfs/nfs4idmap.c | 2 +- fs/nfsd/nfs4callback.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 342717bf1dc2..6f3285f1dfee 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -189,7 +189,7 @@ init_cifs_spnego(void) * spnego upcalls. */ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fa480d62f313..574de2b225ae 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -465,7 +465,7 @@ init_cifs_idmap(void) * this is used to prevent malicious redirections from being installed * with add_key(). */ - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index d96da872d70a..2a4fbbd55b91 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -623,7 +623,7 @@ int ksmbd_override_fsids(struct ksmbd_work *work) if (share->force_gid != KSMBD_SHARE_INVALID_GID) gid = share->force_gid; - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1ec79ccf89ad..7deb3cd76abe 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -493,10 +493,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, gid = make_kgid(&init_user_ns, id); if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); else { unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); memalloc_nofs_restore(nofs_flags); } rc = -ENOMEM; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index e3fdd2f45b01..25a7c771cfd8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -203,7 +203,7 @@ int nfs_idmap_init(void) printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f0e69edf5f0f..4a9e8d17e56a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -870,7 +870,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r } else { struct cred *kcred; - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); if (!kcred) return NULL; -- cgit From 905889bc6c842d18f369bf2834cf7219f32709ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:13 -0700 Subject: btrfs: 
send: Proactively round up to kmalloc bucket size Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Acked-by: David Sterba Link: https://lore.kernel.org/lkml/20220922133014.GI32411@suse.cz Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220923202822.2667581-8-keescook@chromium.org --- fs/btrfs/send.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ef4167072b8..f53e8049473d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -438,6 +438,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) path_len = p->end - p->start; old_buf_len = p->buf_len; + /* + * Allocate to the next largest kmalloc bucket size, to let + * the fast path happen most of the time. + */ + len = kmalloc_size_roundup(len); /* * First time the inline_buf does not suffice */ @@ -451,11 +456,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; - /* - * The real size of the buffer is bigger, this will let the fast path - * happen most of the time - */ - p->buf_len = ksize(p->buf); + p->buf_len = len; if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; -- cgit From 6dd142d9013ca82155d0c069434c60a0d5755ec0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 20 Sep 2022 14:13:05 -0700 Subject: coredump: Proactively round up to kmalloc bucket size Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. 
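Reduced to a sketch, the pattern in both of these patches looks like this (buf and usable are illustrative names; the helpers are the ones used in the hunks below):

  size = kmalloc_size_roundup(size);      /* round up first ... */
  buf = krealloc(buf, size, GFP_KERNEL);  /* ... then allocate the full size */
  if (!buf)
          return -ENOMEM;
  usable = size;  /* instead of discovering it afterwards via ksize(buf) */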
Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook --- fs/coredump.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..97eaee325251 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -68,7 +68,10 @@ struct core_name { static int expand_corename(struct core_name *cn, int size) { - char *corename = krealloc(cn->corename, size, GFP_KERNEL); + char *corename; + + size = kmalloc_size_roundup(size); + corename = krealloc(cn->corename, size, GFP_KERNEL); if (!corename) return -ENOMEM; @@ -76,7 +79,7 @@ static int expand_corename(struct core_name *cn, int size) if (size > core_name_size) /* racy but harmless */ core_name_size = size; - cn->size = ksize(corename); + cn->size = size; cn->corename = corename; return 0; } -- cgit From 91586ce0d39a05f88795aa8814fb99b1387236b3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 21 Oct 2022 10:34:22 +0800 Subject: f2fs: fix to invalidate dcc->f2fs_issue_discard in error path Syzbot reports a NULL pointer dereference issue as below: __refcount_add include/linux/refcount.h:193 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] get_task_struct include/linux/sched/task.h:110 [inline] kthread_stop+0x34/0x1c0 kernel/kthread.c:703 f2fs_stop_discard_thread+0x3c/0x5c fs/f2fs/segment.c:1638 kill_f2fs_super+0x5c/0x194 fs/f2fs/super.c:4522 deactivate_locked_super+0x70/0xe8 fs/super.c:332 deactivate_super+0xd0/0xd4 fs/super.c:363 cleanup_mnt+0x1f8/0x234 fs/namespace.c:1186 __cleanup_mnt+0x20/0x30 fs/namespace.c:1193 task_work_run+0xc4/0x14c kernel/task_work.c:177 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x26c/0xbe0 kernel/exit.c:795 do_group_exit+0x60/0xe8 kernel/exit.c:925 __do_sys_exit_group kernel/exit.c:936 [inline] __se_sys_exit_group kernel/exit.c:934 [inline] __wake_up_parent+0x0/0x40 kernel/exit.c:934 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 The root cause of this issue is in the error path of f2fs_start_discard_thread(): it fails to invalidate dcc->f2fs_issue_discard, so a later kthread_stop() may access a stale pointer.
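The bug class in miniature (an illustrative sketch, not f2fs code):

  worker = kthread_run(fn, data, "example");
  if (IS_ERR(worker)) {
          err = PTR_ERR(worker);
          worker = NULL;  /* the missing step: without this, a later
                           * kthread_stop(worker) dereferences the
                           * ERR_PTR() cookie */
  }
  ...
  if (worker)             /* teardown path */
          kthread_stop(worker);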
Fixes: 4d67490498ac ("f2fs: Don't create discard thread when device doesn't support realtime discard") Reported-by: syzbot+035a381ea1afb63f098d@syzkaller.appspotmail.com Reported-by: syzbot+729c925c2d9fc495ddee@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf3d3fa4363..7a4f7c88b8b9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2025,8 +2025,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) + if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } return err; } -- cgit From 18792e64c86dd7e34ba28e4f61faba472b7bf5fc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Oct 2022 23:09:28 +0800 Subject: f2fs: support fault injection for f2fs_is_valid_blkaddr() This patch supports injecting a fault into f2fs_is_valid_blkaddr() to simulate a caller accessing inconsistent data/meta block addresses. Usage: a) echo 262144 > /sys/fs/f2fs/<dev>/inject_type or b) mount -o fault_type=262144 <dev> <mountpoint> Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 3 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0c82dae082aa..c00694a50222 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -171,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { + if (time_to_inject(sbi, FAULT_BLKADDR)) { + f2fs_show_injection_info(sbi, FAULT_BLKADDR); + return false; + } + switch (type) { case META_NAT: break; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e6355a5683b7..f57cb49dc383 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -60,6 +60,7 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, + FAULT_BLKADDR, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3834ead04620..df26fbe2bf58 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, -- cgit From 3688cbe39b7a9ef3feb73234fb351de33fd1da52 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:08:31 +0800 Subject: f2fs: remove batched_trim_sections node Commit 377224c47118 ("f2fs: don't split checkpoint in fstrim") obsoleted batch mode and the related sysfs entry. Since this testing sysfs node has been deprecated for a long time, let's remove it.
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/sysfs.c | 5 ----- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f57cb49dc383..e990870bdab9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1063,9 +1063,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index df27afd71ef4..926b7a844362 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -488,9 +488,6 @@ out: return -EINVAL; } - if (!strcmp(a->attr.name, "trim_sections")) - return -EINVAL; - if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; @@ -790,7 +787,6 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -919,7 +915,6 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), ATTR_LIST(pending_discard), - ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), -- cgit From 6359a1aaca527311b7145ec6eb16890a5ddf5214 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:24 +0800 Subject: f2fs: fix gc mode when gc_urgent_high_remaining is 1 Under the current logic, when gc_urgent_high_remaining is set to 1, the mode will be switched to normal at the beginning, instead of running in gc_urgent mode. Let's switch the gc mode back to normal when the gc ends. Fixes: 265576181b4a ("f2fs: remove gc_urgent_high_limited for cleanup") Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7b4be412cec0..d2e9c280773f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,16 +96,6 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) - sbi->gc_mode = GC_NORMAL; - } - spin_unlock(&sbi->gc_urgent_high_lock); - } - if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; @@ -162,6 +152,15 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_urgent_high_lock); + } sb_end_write(sbi->sb); } while (!kthread_should_stop()); -- cgit From 44b9d01f2ee32884a7de270394b0fb7f75f87dba Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:05:26 +0800 Subject: f2fs: cleanup in f2fs_create_flush_cmd_control() Just a cleanup for readability, no functional changes. Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7a4f7c88b8b9..0df47ad80efb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,12 +620,12 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0; + int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; } @@ -638,7 +638,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, @@ -650,7 +650,7 @@ init_thread: return err; } - return err; + return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) -- cgit From b5f1a218ae5e4339130d6e733f0e63d623e09a2c Mon Sep 17 00:00:00 2001 From: Dongdong Zhang Date: Tue, 25 Oct 2022 17:40:36 +0800 Subject: f2fs: fix normal discard process In DPOLICY_BG mode, the two conditions "i + 1 < dpolicy->granularity" and "i < DEFAULT_DISCARD_GRANULARITY" conflict. If i = 15, the first condition is false, so the code falls through to the second condition and dispatches all small-granularity discards in __issue_discard_cmd_orderly(). The restriction the first condition places on small discards is thereby bypassed. The two conditions should align.
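Walking through the arithmetic (illustrative, assuming the default granularity of 16):

  int i = 15;
  bool stop    = (i + 1 < 16);  /* false: the loop is not cut off at i = 15 */
  bool orderly = (i < 16);      /* true: small discards get dispatched anyway */
  /* after the fix, the orderly check becomes (i + 1 < 16), which is false
   * here and therefore consistent with the cut-off condition */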
Fixes: 20ee4382322c ("f2fs: issue small discard by LBA order") Signed-off-by: Dongdong Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0df47ad80efb..38f6a2bcb158 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; -- cgit From 6047de5482c33d5f912cdc907336fde9ebc5714e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 01:54:01 +0800 Subject: f2fs: add barrier mount option This patch adds a mount option, barrier, in f2fs. The barrier option is the opposite of nobarrier. If this option is set, cache_flush commands are allowed to be issued. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index df26fbe2bf58..a247027711d8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -111,6 +111,7 @@ enum { Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, + Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, @@ -187,6 +188,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, @@ -807,6 +809,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_barrier: + clear_opt(sbi, NOBARRIER); + break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; @@ -1940,6 +1945,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) -- cgit From a995627e6dd81d4485d40ce64880017a080d71e6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Oct 2022 16:00:35 -0700 Subject: f2fs: allow to set compression for inlined file The commit below disallows setting compression on a newly created empty file which has inline_data. Let's fix it.
Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82cda1258227..f96bbfa8b399 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1915,6 +1915,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (!f2fs_may_compress(inode)) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) -- cgit From c46867e9b9b8e0cdd6a5212c2b5ae616583a3bfd Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:32:26 +0800 Subject: f2fs: introduce max_ordered_discard sysfs node max_ordered_discard is currently a fixed value; change it to be configurable through a sysfs node. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e990870bdab9..fa8dc00dfb2b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -331,6 +331,8 @@ struct discard_entry { /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -410,6 +412,7 @@ struct discard_cmd_control { unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max.
interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 38f6a2bcb158..c470b443615f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; @@ -2048,6 +2048,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 926b7a844362..8095345ebdad 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -483,6 +483,15 @@ out: return count; } + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -786,6 +795,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -914,6 +924,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), + ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), -- cgit From a5029a57a2f3f2e2711f4ac2d876c3c83d1758fe Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Thu, 27 Oct 2022 20:01:05 +0900 Subject: f2fs: Fix typo in comments Change "truncateion" to "truncation". 
Signed-off-by: Keoseong Park Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f96bbfa8b399..c605a4f2bce2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - /* Assumption: truncateion starts with cluster */ + /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); -- cgit From 146dbcbf17a6d07169e75224d949cc2670de2e20 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 27 Oct 2022 18:24:46 +0800 Subject: f2fs: fix return val in f2fs_start_ckpt_thread() Return PTR_ERR(cprc->f2fs_issue_ckpt) instead of -ENOMEM; Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/gc.c | 15 +++++++-------- fs/f2fs/segment.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c00694a50222..56f7d0d6a8b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1902,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + cprc->f2fs_issue_ckpt = NULL; - return -ENOMEM; + return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d2e9c280773f..15f56859966c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -171,13 +171,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM; gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@ -192,12 +189,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c470b443615f..c4270cd6eaab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,7 +620,6 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; @@ -644,7 +643,8 @@ init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); + int err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; -- cgit From 7b02b2201893a71b881026cf574902019ab00db5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Oct 2022 17:30:26 +0800 Subject: f2fs: fix to destroy sbi->post_read_wq in 
error path of f2fs_fill_super() In error path of f2fs_fill_super(), this patch fixes to call f2fs_destroy_post_read_wq() once if we fail in f2fs_start_ckpt_thread(). Fixes: 261eeb9c1585 ("f2fs: introduce checkpoint_merge mount option") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a247027711d8..e6365f040171 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4531,9 +4531,9 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); - f2fs_destroy_post_read_wq(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); -- cgit From a3951cd199a5d26138532d4e55af41262237632e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:32:16 +0800 Subject: f2fs: introduce gc_mode sysfs node Revert "f2fs: make gc_urgent and gc_segment_mode sysfs node readable". Add a gc_mode sysfs node to show the current gc_mode as a string. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa8dc00dfb2b..662b27c19de1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1319,6 +1319,7 @@ enum { MAX_TIME, }; +/* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8095345ebdad..1fbd41c48328 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -143,6 +143,12 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -332,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif - if (!strcmp(a->attr.name, "gc_urgent")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_mode]); - if (!strcmp(a->attr.name, "gc_segment_mode")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_segment_mode]); + return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return sysfs_emit(buf, "%u\n", @@ -844,6 +845,7 @@ F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -926,6 +928,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), + ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), -- cgit From 23ddc81b087c8bd9a73272afa076e204dc2e5410 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Oct 2022 09:49:53 -0700 Subject: f2fs: use sysfs_emit instead of sprintf Let's use sysfs_emit. 
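For context, sysfs_emit() is the PAGE_SIZE-aware replacement for sprintf() in sysfs show() callbacks; a minimal generic sketch of the pattern (illustrative only, not taken from this patch):

/* Minimal sketch of a sysfs show() callback using sysfs_emit(): it
 * verifies buf is a page-aligned sysfs buffer and bounds the output
 * to PAGE_SIZE, which an open-coded sprintf() does not.
 */
#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", 42);	/* hypothetical value */
}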
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1fbd41c48328..af23ed6121b0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(dirty_segments(sbi))); } static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(free_segments(sbi))); } static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(overprovision_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); @@ -125,13 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, static ssize_t sb_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%lx\n", sbi->s_flag); + return sysfs_emit(buf, "%lx\n", sbi->s_flag); } static ssize_t cp_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); + return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); } static ssize_t pending_discard_show(struct f2fs_attr *a, @@ -139,7 +139,7 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, { if (!SM_I(sbi)->dcc_info) return -EINVAL; - return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } @@ -205,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%u\n", sbi->current_reserved_blocks); + return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); } static ssize_t unusable_show(struct f2fs_attr *a, @@ -217,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, unusable = sbi->unusable_block_count; else unusable = f2fs_get_unusable_blocks(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)unusable); + return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); } static ssize_t encoding_show(struct f2fs_attr *a, @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sprintf(buf, "(none)"); + return sysfs_emit(buf, "(none)"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS @@ -247,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long 
long)(si->tot_blks - (si->bg_data_blks + si->bg_node_blks))); } @@ -257,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); } @@ -268,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, { si->dirty_count = dirty_segments(sbi); f2fs_update_sit_info(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); } #endif @@ -363,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return sprintf(buf, "%u\n", *ui); + return sysfs_emit(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -728,7 +728,7 @@ static void f2fs_sb_release(struct kobject *kobj) static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "supported\n"); + return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ @@ -741,8 +741,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (F2FS_HAS_FEATURE(sbi, a->id)) - return sprintf(buf, "supported\n"); - return sprintf(buf, "unsupported\n"); + return sysfs_emit(buf, "supported\n"); + return sysfs_emit(buf, "unsupported\n"); } #define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ -- cgit From e5a0db6a9e2eafe50e3ebc73a8285ae561e7d850 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:25 +0800 Subject: f2fs: replace gc_urgent_high_remaining with gc_remaining_trials The user can set the trial count limit for GC urgent and idle modes through the renamed gc_remaining_trials node. Once the GC thread reaches the limit, the mode finally falls back to GC normal mode. This was applied only to GC_URGENT; this patch extends it to GC_IDLE as well.
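A hedged usage sketch of the renamed node (the device name and the value are assumptions):

/* Hypothetical usage: give background GC a budget of 128 trials.
 * Assumes an f2fs instance on sda1; adjust the sysfs path accordingly.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/f2fs/sda1/gc_remaining_trials", "w");

	if (!f)
		return 1;
	fprintf(f, "128\n");
	return fclose(f) ? 1 : 0;
}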
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 12 ++++++------ fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 12 ++++++------ 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 662b27c19de1..04ef4cce3d7f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1736,8 +1736,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ - spinlock_t gc_urgent_high_lock; - unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 15f56859966c..6466db75af5d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -152,14 +152,14 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) sbi->gc_mode = GC_NORMAL; } - spin_unlock(&sbi->gc_urgent_high_lock); + spin_unlock(&sbi->gc_remaining_trials_lock); } sb_end_write(sbi->sb); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6365f040171..a43d8a46a6e5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3624,7 +3624,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; - spin_lock_init(&sbi->gc_urgent_high_lock); + spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index af23ed6121b0..032c03e09580 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -538,10 +538,10 @@ out: return count; } - if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { - spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_remaining = t; - spin_unlock(&sbi->gc_urgent_high_lock); + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); return count; } @@ -832,7 +832,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -961,7 +961,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), - ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), -- 
cgit From 3b21b794b5797d35f4fad930b53b1cd881c12dd3 Mon Sep 17 00:00:00 2001 From: "wangkailong@jari.cn" Date: Sat, 29 Oct 2022 22:49:30 +0800 Subject: f2fs: replace ternary operator with max() Fix the following coccicheck warning: ./fs/f2fs/segment.c:877:24-25: WARNING opportunity for max() Signed-off-by: KaiLong Wang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4270cd6eaab..aa4be7f25963 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -856,7 +856,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) } mutex_unlock(&dirty_i->seglist_lock); - unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; -- cgit From 5b52aebef8954cadff29918bb61d7fdc7be07837 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Nov 2022 08:18:46 +0100 Subject: ovl: call posix_acl_release() after error checking The current placement of posix_acl_release() in ovl_set_or_remove_acl() means it can be called on an error pointer instead of actual acls. Fix this by moving the posix_acl_release() call after the error handling. Fixes: 0e641857322f ("ovl: implement set acl method") # mainline only Reported-by: syzbot+3f6ef1c4586bb6fd1f61@syzkaller.appspotmail.com Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 77a77fd7a77b..ee6dfa577c93 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -621,11 +621,11 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, acl_name); revert_creds(old_cred); - posix_acl_release(real_acl); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); goto out_drop_write; } + posix_acl_release(real_acl); } if (!upperdentry) { -- cgit From 118e021b4b66f758f8e8f21dc0e5e0a4c721e69e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 7 Nov 2022 10:09:11 +1100 Subject: xfs: write page faults in iomap are not buffered writes When we reserve a delalloc region in xfs_buffered_write_iomap_begin, we mark the iomap as IOMAP_F_NEW so that the write context understands that it allocated the delalloc region. If we then fail that buffered write, xfs_buffered_write_iomap_end() checks for the IOMAP_F_NEW flag and if it is set, it punches out the unused delalloc region that was allocated for the write. The assumption this code makes is that all buffered write operations that can allocate space are run under an exclusive lock (i_rwsem). This is an invalid assumption: page faults in mmap()d regions call through this same function pair to map the file range being faulted, and this runs only holding the inode->i_mapping->invalidate_lock in shared mode. IOWs, we can have races between page faults and write() calls that fail the nested page cache write operation and result in data loss. That is, the failing iomap_end call will punch out the data that the other racing iomap iteration brought into the page cache. This can be reproduced with generic/34[46] if we arbitrarily fail page cache copy-in operations from write() syscalls.
Code analysis tells us that the iomap_page_mkwrite() function holds the already instantiated and uptodate folio locked across the iomap mapping iterations. Hence the folio cannot be removed from memory whilst we are mapping the range it covers, and as such we do not care if the mapping changes state underneath the iomap iteration loop: 1. If the folio is not already dirty, no writeback races are possible. 2. If we allocated the mapping (delalloc or unwritten), the folio cannot already be dirty. See #1. 3. If the folio is already dirty, it must be up to date. As we hold it locked, it cannot be reclaimed from memory. Hence we always have valid data in the page cache while iterating the mapping. 4. Valid data in the page cache can exist when the underlying mapping is DELALLOC, UNWRITTEN or WRITTEN. Having the mapping change from DELALLOC->UNWRITTEN or UNWRITTEN->WRITTEN does not change the data in the page - it only affects actions if we are initialising a new page. Hence #3 applies and we don't care about these extent map transitions racing with iomap_page_mkwrite(). 5. iomap_page_mkwrite() checks for page invalidation races (truncate, hole punch, etc) after it locks the folio. We also hold the mapping->invalidate_lock here, and hence the mapping cannot change due to extent removal operations while we are iterating the folio. As such, filesystems that don't use bufferheads will never fail the iomap_folio_mkwrite_iter() operation on the current mapping, regardless of whether the iomap should be considered stale. Further, the range we are asked to iterate is limited to the range inside EOF that the folio spans. Hence, for XFS, we will only map the exact range we are asked for, and we will only do speculative preallocation with delalloc if we are mapping a hole at the EOF page. The iterator will consume the entire range of the folio that is within EOF, and anything beyond the EOF block cannot be accessed. We never need to truncate this post-EOF speculative prealloc away in the context of the iomap_page_mkwrite() iterator because if it remains unused we'll remove it when the last reference to the inode goes away. Hence we don't actually need an .iomap_end() cleanup/error handling path at all for iomap_page_mkwrite() for XFS. This means we can separate the page fault processing from the complexity of the .iomap_end() processing in the buffered write path. This also means that the buffered write path will be able to take the mapping->invalidate_lock as necessary. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J.
Wong --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_iomap.c | 9 +++++++++ fs/xfs/xfs_iomap.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e462d39c840e..595a5bcf46b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1325,7 +1325,7 @@ __xfs_filemap_fault( if (write_fault) { xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, - &xfs_buffered_write_iomap_ops); + &xfs_page_mkwrite_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { ret = filemap_fault(vmf); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..5cea069a38b4 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1187,6 +1187,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; +/* + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents + * that it allocated to be revoked. Hence we do not need an .iomap_end method + * for this operation. + */ +const struct iomap_ops xfs_page_mkwrite_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, +}; + static int xfs_read_iomap_begin( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c782e8c0479c..0f62ab633040 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -47,6 +47,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; -- cgit From 42da66ac7bcb19181385e851094ceedfe7c81984 Mon Sep 17 00:00:00 2001 From: Michael Weiß Date: Mon, 24 Oct 2022 21:15:52 +0200 Subject: squashfs: enable idmapped mounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For squashfs all needed functionality for idmapped mounts is already implemented by the generic handlers in the VFS. Thus, it is sufficient to just enable the corresponding FS_ALLOW_IDMAP flag to support idmapped mounts. We use this for unprivileged (user namespaced) containers based on squashfs images as rootfs in GyroidOS. A simple test using the mount-idmapped tool executed as user with uid=1000 looks as follows: $ mkdir test $ echo "test" > test/test_file $ mksquashfs test/ fs.img $ sudo mkdir /mnt/test $ sudo mkdir /mnt/mapped $ sudo mount fs.img -o loop /mnt/test/ $ sudo ./mount-idmapped --map-mount b:1000:2000:1 /mnt/test/ /mnt/mapped/ $ mount | tail -n2 fs.img on /mnt/test type squashfs (ro,relatime,errors=continue) fs.img on /mnt/mapped type squashfs (ro,relatime,idmapped,errors=continue) $ ls -lan /mnt/test/ total 5 drwxr-xr-x 2 1000 1000 32 Okt 24 13:36 . drwxr-xr-x 6 0 0 4096 Okt 24 13:38 .. -rw-r--r-- 1 1000 1000 5 Okt 24 13:36 test_file $ ls -lan /mnt/mapped/ total 5 drwxr-xr-x 2 2000 2000 32 Okt 24 13:36 . drwxr-xr-x 6 0 0 4096 Okt 24 13:38 .. 
-rw-r--r-- 1 2000 2000 5 Okt 24 13:36 test_file Signed-off-by: Michael Weiß Reviewed-by: Christian Brauner Reviewed-by: Phillip Lougher Signed-off-by: Christian Brauner (Microsoft) --- fs/squashfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 32565dafa7f3..2636cb354435 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -568,7 +568,7 @@ static struct file_system_type squashfs_fs_type = { .init_fs_context = squashfs_init_fs_context, .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("squashfs"); -- cgit From e7eda157c4071cd1e69f4b1687b0fbe1ae5e6f46 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Thu, 3 Nov 2022 16:12:05 +0100 Subject: fs: don't audit the capability check in simple_xattr_list() The check being unconditional may lead to unwanted denials reported by LSMs when a process has the capability granted by DAC, but denied by an LSM. In the case of SELinux such denials are a problem, since they can't be effectively filtered out via the policy and when not silenced, they produce noise that may hide a true problem or an attack. Checking for the capability only if any trusted xattr is actually present wouldn't really address the issue, since calling listxattr(2) on such a node on its own doesn't indicate an explicit attempt to see the trusted xattrs. Additionally, it could potentially leak the presence of trusted xattrs to an unprivileged user if they can check for the denials (e.g. through dmesg). Therefore, it's best (and simplest) to keep the check unconditional and instead use ns_capable_noaudit(), which will silence any associated LSM denials.
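The capability result feeds the trusted.* filter in simple_xattr_list(); an abridged sketch of that loop (simplified from fs/xattr.c, error handling trimmed):

/* Abridged sketch: 'trusted' only decides whether trusted.* names are
 * listed; the _noaudit variant keeps the unprivileged case quiet.
 */
bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
struct simple_xattr *xattr;

list_for_each_entry(xattr, &xattrs->head, list) {
	/* skip "trusted." attributes for unprivileged callers */
	if (!trusted && xattr_is_trusted(xattr->name))
		continue;
	err = xattr_list_one(&buffer, &remaining_size, xattr->name);
	if (err)
		break;
}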
Fixes: 38f38657444d ("xattr: extract simple_xattr code from tmpfs") Reported-by: Martin Pitt Suggested-by: Christian Brauner (Microsoft) Signed-off-by: Ondrej Mosnacek Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Paul Moore Signed-off-by: Christian Brauner (Microsoft) --- fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..427b8cea1f96 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1140,7 +1140,7 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size, ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); + bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; -- cgit From f6b1a1cf1c3ee430d3f5e47847047ce789a690aa Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Sep 2022 20:04:34 +0800 Subject: ext4: fix use-after-free in ext4_ext_shift_extents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the starting position of our insert range happens to fall in the hole between two ext4_extent_idx blocks, then, because the lblk of the ext4_extent in the previous ext4_extent_idx is always less than start, the "extent" variable is accessed out of bounds and the following UAF is triggered: ================================================================== BUG: KASAN: use-after-free in ext4_ext_shift_extents+0x257/0x790 Read of size 4 at addr ffff88819807a008 by task fallocate/8010 CPU: 3 PID: 8010 Comm: fallocate Tainted: G E 5.10.0+ #492 Call Trace: dump_stack+0x7d/0xa3 print_address_description.constprop.0+0x1e/0x220 kasan_report.cold+0x67/0x7f ext4_ext_shift_extents+0x257/0x790 ext4_insert_range+0x5b6/0x700 ext4_fallocate+0x39e/0x3d0 vfs_fallocate+0x26f/0x470 ksys_fallocate+0x3a/0x70 __x64_sys_fallocate+0x4f/0x60 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ================================================================== For right shifts, we can divide them into the following situations: 1. When the first ee_block of ext4_extent_idx is greater than or equal to start, make right shifts directly from the first ee_block. 1) If it is greater than start, we need to continue searching in the previous ext4_extent_idx. 2) If it is equal to start, we can exit the loop (iterator=NULL). 2. When the first ee_block of ext4_extent_idx is less than start, then traverse from the last extent to find the first extent whose ee_block is less than start. 1) If extent is still the last extent after traversal, it means that the last ee_block of ext4_extent_idx is less than start, that is, start is located in the hole between idx and (idx+1), so we can exit the loop directly (break) without right shifts. 2) Otherwise, make right shifts at the corresponding position of the found extent, and then exit the loop (iterator=NULL).
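A hypothetical user-space outline of the trigger (offsets are assumptions; actually hitting the bug also requires an extent tree whose index blocks leave a hole at the chosen start):

/* Hypothetical outline: an insert-range whose start falls into the
 * hole between two extent index blocks drove the out-of-bounds access.
 * Offsets must be block-aligned; the file layout is an assumption.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/ext4/victim", O_RDWR);

	if (fd < 0)
		return 1;
	/* start at 16MiB, assumed to sit in an inter-index hole */
	return fallocate(fd, FALLOC_FL_INSERT_RANGE, 16 << 20, 1 << 20);
}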
Fixes: 331573febb6a ("ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Zhihao Cheng Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20220922120434.1294789-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f1956288307f..6c399a8b22b3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5184,6 +5184,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * and it is decreased till we reach start. */ again: + ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else @@ -5227,14 +5228,21 @@ again: ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - if (le32_to_cpu(extent->ee_block) > 0) + if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; - else - /* Beginning is reached, end of the loop */ + else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; - /* Update path extent in case we need to stop */ - while (le32_to_cpu(extent->ee_block) < start) + else { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + while (le32_to_cpu(extent->ee_block) >= start) + extent--; + + if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) + break; + extent++; + iterator = NULL; + } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, -- cgit From d96d0f9617793b6cd95b9b9a8fef66f69f8f6b1b Mon Sep 17 00:00:00 2001 From: Paulo Miguel Almeida Date: Wed, 12 Oct 2022 09:23:14 +1300 Subject: dlm: replace one-element array with fixed size array One-element arrays are deprecated. So, replace the one-element array with a fixed size array member in struct dlm_ls, and refactor the rest of the code accordingly. Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/228 Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 Link: https://lore.kernel.org/lkml/Y0W5jkiXUkpNl4ap@mail.google.com/ Signed-off-by: Paulo Miguel Almeida Reviewed-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 2 +- fs/dlm/lockspace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index e34c3d2639a5..94fadb619ba0 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -670,7 +670,7 @@ struct dlm_ls { void *ls_ops_arg; int ls_namelen; - char ls_name[1]; + char ls_name[DLM_LOCKSPACE_LEN + 1]; }; /* diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index bae050df7abf..9479c8110979 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -473,7 +473,7 @@ static int new_lockspace(const char *name, const char *cluster, error = -ENOMEM; - ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + ls = kzalloc(sizeof(*ls), GFP_NOFS); if (!ls) goto out; memcpy(ls->ls_name, name, namelen); -- cgit From 08ae0547e75ec3d062b6b6b9cf4830c730df68df Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:11 -0400 Subject: fs: dlm: fix sock release if listen fails This patch fixes a double sock_release() call when listen() fails for the dlm lowcomms listen socket. The caller of dlm_listen_for_all() should never need to care about releasing the socket when dlm_listen_for_all() fails; the release is now done exactly once, at the point where listen() fails.
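Condensed from the patch context, the resulting ownership rule looks roughly like this (illustrative sketch; sock and listen_con are the lowcomms file-scope variables):

/* Sketch of the single-release rule after this fix: the function that
 * created the listen socket releases it exactly once where listen()
 * fails, so the fail_proto_ops unwind path must not close it again.
 */
result = sock->ops->listen(sock, 5);
if (result < 0) {
	dlm_close_sock(&listen_con.sock);	/* the one and only release */
	return result;
}
return 0;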
Cc: stable@vger.kernel.org Fixes: 2dc6b1158c28 ("fs: dlm: introduce generic listen") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 59f64c596233..2cb9f3b49e05 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1820,7 +1820,7 @@ static int dlm_listen_for_all(void) result = sock->ops->listen(sock, 5); if (result < 0) { dlm_close_sock(&listen_con.sock); - goto out; + return result; } return 0; @@ -2023,7 +2023,6 @@ fail_listen: dlm_proto_ops = NULL; fail_proto_ops: dlm_allow_conn = 0; - dlm_close_sock(&listen_con.sock); work_stop(); fail_local: deinit_local(); -- cgit From f0f4bb431bd543ed7bebbaea3ce326cfcd5388bc Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:12 -0400 Subject: fs: dlm: retry accept() until -EAGAIN or error returns This patch fixes a race when we get a socket data-ready event twice while the listen connection worker is queued. Currently it is served only once, but we need to keep accepting (in this case twice) until we hit -EAGAIN, which tells us there is no pending accept going on. This patch wraps the accept in a do-while loop until we receive a return value different from 0, as was done before commit d11ccd451b65 ("fs: dlm: listen socket out of connection hash"). Cc: stable@vger.kernel.org Fixes: d11ccd451b65 ("fs: dlm: listen socket out of connection hash") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2cb9f3b49e05..871d4e9f49fb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1543,7 +1543,11 @@ static void process_recv_sockets(struct work_struct *work) static void process_listen_recv_socket(struct work_struct *work) { - accept_from_sock(&listen_con); + int ret; + + do { + ret = accept_from_sock(&listen_con); + } while (!ret); } static void dlm_connect(struct connection *con) -- cgit From 57a5724ef0b332eb6e78250157910a006b01bf6e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:13 -0400 Subject: fs: dlm: remove send repeat remove handling This patch removes the send repeat remove handling. This handling exists to repeatedly send DLM_MSG_REMOVE messages in cases where the dlm stack thinks the first one was not received. In cases of message drops this functionality is necessary, but since the DLM midcomms layer guarantees there are no message drops between cluster nodes, this feature is no longer strictly necessary. Due to message delays/processing, a second send_repeat_remove() could be sent out while the first is still on its way. We remove the repeat remove handling because we are sure that the message cannot be dropped due to communication errors.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 74 ----------------------------------------------------------- 1 file changed, 74 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 94a72ede5764..b246d71b5e17 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -4044,66 +4044,6 @@ out: return error; } -static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) -{ - char name[DLM_RESNAME_MAXLEN + 1]; - struct dlm_message *ms; - struct dlm_mhandle *mh; - struct dlm_rsb *r; - uint32_t hash, b; - int rv, dir_nodeid; - - memset(name, 0, sizeof(name)); - memcpy(name, ms_name, len); - - hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - - dir_nodeid = dlm_hash2nodeid(ls, hash); - - log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); - - spin_lock(&ls->ls_rsbtbl[b].lock); - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on keep %s", name); - return; - } - - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on toss %s", name); - return; - } - - /* use ls->remove_name2 to avoid conflict with shrink? */ - - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); - - rv = _create_message(ls, sizeof(struct dlm_message) + len, - dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); - if (rv) - goto out; - - memcpy(ms->m_extra, name, len); - ms->m_hash = cpu_to_le32(hash); - - send_message(mh, ms); - -out: - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); -} - static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; @@ -4173,25 +4113,11 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) ENOTBLK request failures when the lookup reply designating us as master is delayed. */ - /* We could repeatedly return -EBADR here if our send_remove() is - delayed in being sent/arriving/being processed on the dir node. - Another node would repeatedly lookup up the master, and the dir - node would continue returning our nodeid until our send_remove - took effect. - - We send another remove message in case our previous send_remove - was lost/ignored/missed somehow. */ - if (error != -ENOTBLK) { log_limit(ls, "receive_request %x from %d %d", le32_to_cpu(ms->m_lkid), from_nodeid, error); } - if (namelen && error == -EBADR) { - send_repeat_remove(ls, ms->m_extra, namelen); - msleep(1000); - } - setup_stub_lkb(ls, ms); send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); return error; -- cgit From 5b787667e87a373a2f8f70e6be2b5d99c408462f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:14 -0400 Subject: fs: dlm: use packet in dlm_mhandle To allow more than just dereferencing the inner header we directly point to the inner dlm packet which allows us to dereference the header, rcom or message structure. 
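In practice this gives tagged-union style access through one pointer; a rough sketch of the dereference pattern (field and constant names follow the patch):

/* Sketch: with inner_p pointing at the full union dlm_packet, the
 * commit path can branch on the inner command and reach message- or
 * rcom-specific fields, not just the common header.
 */
static void commit_sketch(const struct dlm_mhandle *mh)
{
	switch (mh->inner_p->header.h_cmd) {
	case DLM_MSG:
		/* message fields now reachable: mh->inner_p->message */
		break;
	case DLM_RCOM:
		/* recovery fields likewise: mh->inner_p->rcom */
		break;
	}
}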
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 6489bc22ad61..4eed40d66da1 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -194,7 +194,7 @@ struct midcomms_node { }; struct dlm_mhandle { - const struct dlm_header *inner_hd; + const union dlm_packet *inner_p; struct midcomms_node *node; struct dlm_opts *opts; struct dlm_msg *msg; @@ -1055,7 +1055,7 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node dlm_fill_opts_header(opts, len, mh->seq); *ppc += sizeof(*opts); - mh->inner_hd = (const struct dlm_header *)*ppc; + mh->inner_p = (const union dlm_packet *)*ppc; return msg; } @@ -1133,7 +1133,7 @@ err: static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) { /* nexthdr chain for fast lookup */ - mh->opts->o_nextcmd = mh->inner_hd->h_cmd; + mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; mh->committed = true; dlm_lowcomms_commit_msg(mh->msg); } -- cgit From e01c4b7bd41522ae0299c07e2ee8c721fee02595 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:15 -0400 Subject: fd: dlm: trace send/recv of dlm message and rcom This patch adds tracepoints for the send and recv cases of dlm messages and dlm rcom messages. In the send case of a dlm message, we add the name of the dlm rsb resource that the message belongs to. This has the advantage of being able to follow dlm messages on a per-lock basis. In the recv case, the resource name can be recovered by following the sequence number of the corresponding send message. The dlm message DLM_MSG_PURGE doesn't belong to a lock request and will not set the resource name in a dlm_message trace. The same holds for all rcom messages. The additional handling required for this debugging functionality is kept as small as possible. The midcomms layer also becomes aware of lock resource names; for now this is required to connect sequence numbers with lock resource names, and it is for debugging purposes only.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 21 +++++++++++---------- fs/dlm/midcomms.c | 45 +++++++++++++++++++++++++++++++++++++++++---- fs/dlm/midcomms.h | 3 ++- fs/dlm/rcom.c | 4 ++-- 4 files changed, 56 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b246d71b5e17..0b1bc24536ce 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3611,9 +3611,10 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, /* further lowcomms enhancements or alternate implementations may make the return value from this function useful at some point */ -static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms, + const void *name, int namelen) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, name, namelen); return 0; } @@ -3679,7 +3680,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) send_args(r, lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3742,7 +3743,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) ms->m_result = 0; - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3763,7 +3764,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) ms->m_bastmode = cpu_to_le32(mode); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3786,7 +3787,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) send_args(r, lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3811,7 +3812,7 @@ static int send_remove(struct dlm_rsb *r) memcpy(ms->m_extra, r->res_name, r->res_length); ms->m_hash = cpu_to_le32(r->res_hash); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3833,7 +3834,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3874,7 +3875,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); ms->m_nodeid = cpu_to_le32(ret_nodeid); - error = send_message(mh, ms); + error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in)); out: return error; } @@ -6300,7 +6301,7 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid) ms->m_nodeid = cpu_to_le32(nodeid); ms->m_pid = cpu_to_le32(pid); - return send_message(mh, ms); + return send_message(mh, ms, NULL, 0); } int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 4eed40d66da1..e10a6e97df44 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -132,6 +132,7 @@ */ #define DLM_DEBUG_FENCE_TERMINATION 0 +#include #include #include "dlm_internal.h" @@ -415,7 +416,7 @@ static int dlm_send_fin(struct midcomms_node *node, m_header->h_cmd = DLM_FIN; pr_debug("sending fin msg to node %d\n", node->nodeid); - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); return 0; @@ -474,6 +475,20 @@ static void dlm_pas_fin_ack_rcv(struct 
midcomms_node *node) spin_unlock(&node->state_lock); } +static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) +{ + switch (p->header.h_cmd) { + case DLM_MSG: + trace_dlm_recv_message(seq, &p->message); + break; + case DLM_RCOM: + trace_dlm_recv_rcom(seq, &p->rcom); + break; + default: + break; + } +} + static void dlm_midcomms_receive_buffer(union dlm_packet *p, struct midcomms_node *node, uint32_t seq) @@ -534,6 +549,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, break; default: WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); break; @@ -1130,11 +1146,30 @@ err: } #endif -static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) +static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, + const void *name, int namelen) +{ + switch (mh->inner_p->header.h_cmd) { + case DLM_MSG: + trace_dlm_send_message(mh->seq, &mh->inner_p->message, + name, namelen); + break; + case DLM_RCOM: + trace_dlm_send_rcom(mh->seq, &mh->inner_p->rcom); + break; + default: + /* nothing to trace */ + break; + } +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, + const void *name, int namelen) { /* nexthdr chain for fast lookup */ mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; mh->committed = true; + dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen); dlm_lowcomms_commit_msg(mh->msg); } @@ -1142,8 +1177,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) * dlm_midcomms_get_mhandle */ #ifndef __CHECKER__ -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, + const void *name, int namelen) { + switch (mh->node->version) { case DLM_VERSION_3_1: srcu_read_unlock(&nodes_srcu, mh->idx); @@ -1154,7 +1191,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: - dlm_midcomms_commit_msg_3_2(mh); + dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); break; default: diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 82bcd9661922..d6286e80b077 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -17,7 +17,8 @@ struct midcomms_node; int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, + int namelen); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); void dlm_midcomms_shutdown(void); diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f19860315043..b76d52e2f6bd 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -91,7 +91,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); } static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) @@ -516,7 +516,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rf = (struct rcom_config *) rc->rc_buf; rf->rf_lvblen = cpu_to_le32(~0U); - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); return 0; } -- cgit From 85839f27b17ddebe21d36a174050420783824ed3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: 
Thu, 27 Oct 2022 16:45:16 -0400 Subject: fs: dlm: let dlm_add_cb queue work after resume only We should allow dlm_add_cb() to call queue_work() only after recovery has queued the pending work for delayed lkbs. This patch moves the clearing of LSFL_CB_DELAY to after the delayed lkb work has been processed. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index d60a8d8f109d..6e07c151ad28 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -308,8 +308,6 @@ void dlm_callback_resume(struct dlm_ls *ls) if (!ls->ls_callback_wq) return; - clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { @@ -320,6 +318,8 @@ more: break; } empty = list_empty(&ls->ls_cb_delay); + if (empty) + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); mutex_unlock(&ls->ls_cb_mutex); sum += count; -- cgit From d3e4dc5d68c8fef719291cc9f3dc907aac494c55 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:17 -0400 Subject: fs: dlm: use list_first_entry macro Instead of using list_entry(), this patch moves to using the list_first_entry() macro. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/user.c b/fs/dlm/user.c index c5d27bccc3dc..6a5de0918a96 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -857,7 +857,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, without removing lkb_cb_list; so empty lkb_cb_list is always consistent with empty lkb_callbacks */ - lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list); /* rem_lkb_callback sets a new lkb_last_cast */ old_mode = lkb->lkb_last_cast.mode; -- cgit From a4c0352bb1094cbe242f4458e267de845790737a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:18 -0400 Subject: fs: dlm: convert ls_cb_mutex mutex to spinlock This patch converts the ls_cb_mutex mutex to a spinlock; there is no sleepable context while this lock is held.
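The conversion itself is mechanical; a condensed before/after sketch (valid only because nothing inside the critical section sleeps: no allocation, no blocking waits):

/* Condensed sketch of the mutex-to-spinlock conversion pattern. */
spin_lock_init(&ls->ls_cb_lock);	/* was: mutex_init(&ls->ls_cb_mutex) */

spin_lock(&ls->ls_cb_lock);		/* was: mutex_lock() */
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
spin_unlock(&ls->ls_cb_lock);		/* was: mutex_unlock() */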
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 12 ++++++------ fs/dlm/dlm_internal.h | 2 +- fs/dlm/lockspace.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 6e07c151ad28..daaa0dff6ef4 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -200,13 +200,13 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, if (!prev_seq) { kref_get(&lkb->lkb_ref); - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); } else { queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); } out: mutex_unlock(&lkb->lkb_cb_mutex); @@ -289,9 +289,9 @@ void dlm_callback_stop(struct dlm_ls *ls) void dlm_callback_suspend(struct dlm_ls *ls) { if (ls->ls_callback_wq) { - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); set_bit(LSFL_CB_DELAY, &ls->ls_flags); - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); flush_workqueue(ls->ls_callback_wq); } @@ -309,7 +309,7 @@ void dlm_callback_resume(struct dlm_ls *ls) return; more: - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); @@ -320,7 +320,7 @@ more: empty = list_empty(&ls->ls_cb_delay); if (empty) clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); sum += count; if (!empty) { diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 94fadb619ba0..fc4be8c35703 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -631,7 +631,7 @@ struct dlm_ls { /* recovery related */ - struct mutex ls_cb_mutex; + spinlock_t ls_cb_lock; struct list_head ls_cb_delay; /* save for queue_work later */ struct timer_list ls_timer; struct task_struct *ls_recoverd_task; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 9479c8110979..4965737705b7 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -567,7 +567,7 @@ static int new_lockspace(const char *name, const char *cluster, init_completion(&ls->ls_recovery_done); ls->ls_recovery_result = -1; - mutex_init(&ls->ls_cb_mutex); + spin_lock_init(&ls->ls_cb_lock); INIT_LIST_HEAD(&ls->ls_cb_delay); ls->ls_recoverd_task = NULL; -- cgit From 92e95733307e7b6dd352c12fa174089ed51e7208 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:19 -0400 Subject: fs: dlm: use spin lock instead of mutex There is no need to use a mutex in these hot path sections. Change it to a spinlock to serve callbacks faster by not allowing scheduling. The locked sections are not held for long.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 8 ++++---- fs/dlm/dlm_internal.h | 2 +- fs/dlm/lock.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index daaa0dff6ef4..3e76ec75bc55 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -190,7 +190,7 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, return; } - mutex_lock(&lkb->lkb_cb_mutex); + spin_lock(&lkb->lkb_cb_lock); prev_seq = lkb->lkb_callbacks[0].seq; rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); @@ -209,7 +209,7 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, spin_unlock(&ls->ls_cb_lock); } out: - mutex_unlock(&lkb->lkb_cb_mutex); + spin_unlock(&lkb->lkb_cb_lock); } void dlm_callback_work(struct work_struct *work) @@ -223,7 +223,7 @@ void dlm_callback_work(struct work_struct *work) memset(&callbacks, 0, sizeof(callbacks)); - mutex_lock(&lkb->lkb_cb_mutex); + spin_lock(&lkb->lkb_cb_lock); if (!lkb->lkb_callbacks[0].seq) { /* no callback work exists, shouldn't happen */ log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); @@ -244,7 +244,7 @@ void dlm_callback_work(struct work_struct *work) dlm_print_lkb(lkb); dlm_dump_lkb_callbacks(lkb); } - mutex_unlock(&lkb->lkb_cb_mutex); + spin_unlock(&lkb->lkb_cb_lock); castfn = lkb->lkb_astfn; bastfn = lkb->lkb_bastfn; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index fc4be8c35703..730808289a42 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -268,7 +268,7 @@ struct dlm_lkb { unsigned long lkb_timeout_cs; #endif - struct mutex lkb_cb_mutex; + spinlock_t lkb_cb_lock; struct work_struct lkb_cb_work; struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 0b1bc24536ce..40e4e4a1c582 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1218,7 +1218,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, INIT_LIST_HEAD(&lkb->lkb_time_list); #endif INIT_LIST_HEAD(&lkb->lkb_cb_list); - mutex_init(&lkb->lkb_cb_mutex); + spin_lock_init(&lkb->lkb_cb_lock); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); idr_preload(GFP_NOFS); -- cgit From 27d3994ebb5cea9c26f52064a3da8b0e606a8d11 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:20 -0400 Subject: fs: dlm: move last cast bast time to function call This patch moves recording of the last cast and bast debug timestamps to the point where the cast and bast callbacks are actually invoked.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 3e76ec75bc55..8393d2090c1c 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -158,15 +158,11 @@ int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, } } - if (cb->flags & DLM_CB_CAST) { + if (cb->flags & DLM_CB_CAST) memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_cast_time = ktime_get(); - } - if (cb->flags & DLM_CB_BAST) { + if (cb->flags & DLM_CB_BAST) memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_bast_time = ktime_get(); - } rv = 0; out: return rv; @@ -256,11 +252,13 @@ void dlm_callback_work(struct work_struct *work) continue; } else if (callbacks[i].flags & DLM_CB_BAST) { trace_dlm_bast(ls, lkb, callbacks[i].mode); + lkb->lkb_last_bast_time = ktime_get(); bastfn(lkb->lkb_astparam, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; trace_dlm_ast(ls, lkb); + lkb->lkb_last_cast_time = ktime_get(); castfn(lkb->lkb_astparam); } } -- cgit From 61bed0baa4dba17dd06cdfe20481a580718d6c7c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:21 -0400 Subject: fs: dlm: use a non-static queue for callbacks This patch introduces a queue implementation for callbacks using Linux lists. The current callback queue handling is implemented with a static limit of 6 entries, see DLM_CALLBACKS_SIZE. The sequence number inside the callback structure was used to see whether an entry inside the static array is valid or not. We don't need any sequence numbers anymore with a dynamic data structure that grows and shrinks during runtime to offer such functionality. We assume that every callback, once queued, will be delivered to the DLM user. Therefore the callback flag DLM_CB_SKIP was dropped, and the check for skipping a bast was moved before worker handling instead of being done while the callback worker executes. This reduces unnecessary queueing of the callback worker. All last-callback saves are pointers now and don't need to be copied over. A reference counter on the callback structures takes care of freeing them at the right time once they are no longer referenced.
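As a minimal, self-contained sketch of the new scheme (hypothetical names, illustrative only; the real structure is the reworked struct dlm_callback below):

    struct cb {
            struct list_head list;
            struct kref ref;        /* freed when the last holder drops it */
    };

    static void cb_release(struct kref *ref)
    {
            kfree(container_of(ref, struct cb, ref));
    }

    static int cb_enqueue(struct list_head *queue, gfp_t gfp)
    {
            struct cb *cb = kmalloc(sizeof(*cb), gfp);

            if (!cb)
                    return -ENOMEM;
            kref_init(&cb->ref);    /* the queue owns one reference */
            list_add_tail(&cb->list, queue);
            return 0;
    }

    static void cb_dequeue(struct list_head *queue)
    {
            struct cb *cb = list_first_entry_or_null(queue, struct cb, list);

            if (!cb)
                    return;
            list_del(&cb->list);
            kref_put(&cb->ref, cb_release); /* may free cb here */
    }

A "last callback" pointer simply takes an extra kref_get() and drops it later with kref_put(), which is how lkb_last_cast and lkb_last_cb can outlive the queue entry without copying it.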
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 294 +++++++++++++++++++++----------------------------- fs/dlm/ast.h | 17 ++- fs/dlm/debug_fs.c | 2 +- fs/dlm/dlm_internal.h | 15 +-- fs/dlm/lock.c | 8 +- fs/dlm/memory.c | 26 +++++ fs/dlm/memory.h | 2 + fs/dlm/user.c | 73 +++++++------ fs/dlm/user.h | 2 +- 9 files changed, 222 insertions(+), 217 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 8393d2090c1c..078bbbd43a53 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -12,55 +12,68 @@ #include #include "dlm_internal.h" +#include "memory.h" #include "lock.h" #include "user.h" #include "ast.h" -static uint64_t dlm_cb_seq; -static DEFINE_SPINLOCK(dlm_cb_seq_spin); +void dlm_release_callback(struct kref *ref) +{ + struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref); + + dlm_free_cb(cb); +} + +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to) +{ + if (*from) + kref_put(&(*from)->ref, dlm_release_callback); + + if (to) + kref_get(&to->ref); + + *from = to; +} -static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) { - int i; - - log_print("last_bast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_bast.seq, - lkb->lkb_last_bast.flags, - lkb->lkb_last_bast.mode, - lkb->lkb_last_bast.sb_status, - lkb->lkb_last_bast.sb_flags); - - log_print("last_cast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.flags, - lkb->lkb_last_cast.mode, - lkb->lkb_last_cast.sb_status, - lkb->lkb_last_cast.sb_flags); - - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - log_print("cb %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_callbacks[i].seq, - lkb->lkb_callbacks[i].flags, - lkb->lkb_callbacks[i].mode, - lkb->lkb_callbacks[i].sb_status, - lkb->lkb_callbacks[i].sb_flags); + struct dlm_callback *cb, *safe; + + list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) { + list_del(&cb->list); + kref_put(&cb->ref, dlm_release_callback); } + + /* TODO */ + lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + + /* invalidate */ + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + lkb->lkb_last_bast_mode = -1; } -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t prev_seq; + int rv = DLM_ENQUEUE_CALLBACK_SUCCESS; + struct dlm_callback *cb; int prev_mode; - int i, rv; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (lkb->lkb_callbacks[i].seq) - continue; + if (flags & DLM_CB_BAST) { + /* if cb is a bast, it should be skipped if the blocking mode is + * compatible with the last granted mode + */ + if (lkb->lkb_last_cast) { + if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) { + log_debug(ls, "skip %x bast mode %d for cast mode %d", + lkb->lkb_id, mode, + lkb->lkb_last_cast->mode); + goto out; + } + } /* * Suppress some redundant basts here, do more on removal. @@ -68,132 +81,75 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, * is a bast for the same mode or a more restrictive mode. 
* (the addional > PR check is needed for PR/CW inversion) */ - - if ((i > 0) && (flags & DLM_CB_BAST) && - (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { - - prev_seq = lkb->lkb_callbacks[i-1].seq; - prev_mode = lkb->lkb_callbacks[i-1].mode; + if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) { + prev_mode = lkb->lkb_last_cb->mode; if ((prev_mode == mode) || (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { - - log_debug(ls, "skip %x add bast %llu mode %d " - "for bast %llu mode %d", - lkb->lkb_id, - (unsigned long long)seq, - mode, - (unsigned long long)prev_seq, - prev_mode); - rv = 0; + log_debug(ls, "skip %x add bast mode %d for bast mode %d", + lkb->lkb_id, mode, prev_mode); goto out; } } - - lkb->lkb_callbacks[i].seq = seq; - lkb->lkb_callbacks[i].flags = flags; - lkb->lkb_callbacks[i].mode = mode; - lkb->lkb_callbacks[i].sb_status = status; - lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); - rv = 0; - break; } - if (i == DLM_CALLBACKS_SIZE) { - log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, (unsigned long long)seq, - flags, mode, status, sbflags); - dlm_dump_lkb_callbacks(lkb); - rv = -1; + cb = dlm_allocate_cb(); + if (!cb) { + rv = DLM_ENQUEUE_CALLBACK_FAILURE; goto out; } - out: - return rv; -} - -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid) -{ - int i, rv; - - *resid = 0; - - if (!lkb->lkb_callbacks[0].seq) { - rv = -ENOENT; - goto out; - } - - /* oldest undelivered cb is callbacks[0] */ - - memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); - /* shift others down */ - - for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { - if (!lkb->lkb_callbacks[i].seq) - break; - memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], - sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); - (*resid)++; + cb->flags = flags; + cb->mode = mode; + cb->sb_status = status; + cb->sb_flags = (sbflags & 0x000000FF); + kref_init(&cb->ref); + if (!(lkb->lkb_flags & DLM_IFL_NEED_SCHED)) { + lkb->lkb_flags |= DLM_IFL_NEED_SCHED; + rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; } + list_add_tail(&cb->list, &lkb->lkb_callbacks); - /* if cb is a bast, it should be skipped if the blocking mode is - compatible with the last granted mode */ - - if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { - if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { - cb->flags |= DLM_CB_SKIP; - - log_debug(ls, "skip %x bast %llu mode %d " - "for cast %llu mode %d", - lkb->lkb_id, - (unsigned long long)cb->seq, - cb->mode, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.mode); - rv = 0; - goto out; - } - } + if (flags & DLM_CB_CAST) + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb); - if (cb->flags & DLM_CB_CAST) - memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb); - if (cb->flags & DLM_CB_BAST) - memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); - rv = 0; out: return rv; } +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb) +{ + /* oldest undelivered cb is callbacks first entry */ + *cb = list_first_entry_or_null(&lkb->lkb_callbacks, + struct dlm_callback, list); + if (!*cb) + return DLM_DEQUEUE_CALLBACK_EMPTY; + + /* remove it from callbacks so shift others down */ + list_del(&(*cb)->list); + if (list_empty(&lkb->lkb_callbacks)) + return DLM_DEQUEUE_CALLBACK_LAST; + + return 
DLM_DEQUEUE_CALLBACK_SUCCESS; +} + void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t new_seq, prev_seq; int rv; - spin_lock(&dlm_cb_seq_spin); - new_seq = ++dlm_cb_seq; - if (!dlm_cb_seq) - new_seq = ++dlm_cb_seq; - spin_unlock(&dlm_cb_seq_spin); - if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq); + dlm_user_add_ast(lkb, flags, mode, status, sbflags); return; } spin_lock(&lkb->lkb_cb_lock); - prev_seq = lkb->lkb_callbacks[0].seq; - - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); - if (rv < 0) - goto out; - - if (!prev_seq) { + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); spin_lock(&ls->ls_cb_lock); @@ -203,8 +159,16 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } spin_unlock(&ls->ls_cb_lock); + break; + case DLM_ENQUEUE_CALLBACK_FAILURE: + WARN_ON(1); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON(1); + break; } - out: spin_unlock(&lkb->lkb_cb_lock); } @@ -214,53 +178,43 @@ void dlm_callback_work(struct work_struct *work) struct dlm_ls *ls = lkb->lkb_resource->res_ls; void (*castfn) (void *astparam); void (*bastfn) (void *astparam, int mode); - struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; - int i, rv, resid; - - memset(&callbacks, 0, sizeof(callbacks)); + struct dlm_callback *cb; + int rv; spin_lock(&lkb->lkb_cb_lock); - if (!lkb->lkb_callbacks[0].seq) { - /* no callback work exists, shouldn't happen */ - log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } - - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); - if (rv < 0) - break; - } - - if (resid) { - /* cbs remain, loop should have removed all, shouldn't happen */ - log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id, - resid); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } + rv = dlm_dequeue_lkb_callback(lkb, &cb); spin_unlock(&lkb->lkb_cb_lock); - castfn = lkb->lkb_astfn; - bastfn = lkb->lkb_bastfn; + if (WARN_ON(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) + return; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (!callbacks[i].seq) - break; - if (callbacks[i].flags & DLM_CB_SKIP) { - continue; - } else if (callbacks[i].flags & DLM_CB_BAST) { - trace_dlm_bast(ls, lkb, callbacks[i].mode); + for (;;) { + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(ls, lkb, cb->mode); lkb->lkb_last_bast_time = ktime_get(); - bastfn(lkb->lkb_astparam, callbacks[i].mode); - } else if (callbacks[i].flags & DLM_CB_CAST) { - lkb->lkb_lksb->sb_status = callbacks[i].sb_status; - lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + lkb->lkb_last_bast_mode = cb->mode; + bastfn(lkb->lkb_astparam, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; trace_dlm_ast(ls, lkb); lkb->lkb_last_cast_time = ktime_get(); castfn(lkb->lkb_astparam); } + + kref_put(&cb->ref, dlm_release_callback); + + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { + lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + spin_unlock(&lkb->lkb_cb_lock); + break; + } + 
spin_unlock(&lkb->lkb_cb_lock); } /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index e5e05fcc5813..880b11882495 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -11,13 +11,22 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid); +#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1 +#define DLM_ENQUEUE_CALLBACK_SUCCESS 0 +#define DLM_ENQUEUE_CALLBACK_FAILURE -1 +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags); +#define DLM_DEQUEUE_CALLBACK_EMPTY 2 +#define DLM_DEQUEUE_CALLBACK_LAST 1 +#define DLM_DEQUEUE_CALLBACK_SUCCESS 0 +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb); void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags); +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to); +void dlm_release_callback(struct kref *ref); +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb); void dlm_callback_work(struct work_struct *work); int dlm_callback_start(struct dlm_ls *ls); void dlm_callback_stop(struct dlm_ls *ls); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8fb04ebbafb5..8a0e1b1f74ad 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -246,7 +246,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - lkb->lkb_last_bast.mode, + lkb->lkb_last_bast_mode, rsb_lookup, lkb->lkb_wait_type, lkb->lkb_lvbseq, diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 730808289a42..69e3928c756b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -211,6 +211,7 @@ struct dlm_args { #endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +#define DLM_IFL_NEED_SCHED 0x04000000 /* least significant 2 bytes are message changed, they are full transmitted * but at receive side only the 2 bytes LSB will be set. 
* @@ -222,18 +223,17 @@ struct dlm_args { #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 -#define DLM_CALLBACKS_SIZE 6 - #define DLM_CB_CAST 0x00000001 #define DLM_CB_BAST 0x00000002 -#define DLM_CB_SKIP 0x00000004 struct dlm_callback { - uint64_t seq; uint32_t flags; /* DLM_CBF_ */ int sb_status; /* copy to lksb status */ uint8_t sb_flags; /* copy to lksb flags */ int8_t mode; /* rq mode of bast, gr mode of cast */ + + struct list_head list; + struct kref ref; }; struct dlm_lkb { @@ -271,9 +271,10 @@ struct dlm_lkb { spinlock_t lkb_cb_lock; struct work_struct lkb_cb_work; struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ - struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; - struct dlm_callback lkb_last_cast; - struct dlm_callback lkb_last_bast; + struct list_head lkb_callbacks; + struct dlm_callback *lkb_last_cast; + struct dlm_callback *lkb_last_cb; + int lkb_last_bast_mode; ktime_t lkb_last_cast_time; /* for debugging */ ktime_t lkb_last_bast_time; /* for debugging */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 40e4e4a1c582..f4b2fb17bbb1 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1209,6 +1209,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, if (!lkb) return -ENOMEM; + lkb->lkb_last_bast_mode = -1; lkb->lkb_nodeid = -1; lkb->lkb_grmode = DLM_LOCK_IV; kref_init(&lkb->lkb_ref); @@ -1218,6 +1219,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, INIT_LIST_HEAD(&lkb->lkb_time_list); #endif INIT_LIST_HEAD(&lkb->lkb_cb_list); + INIT_LIST_HEAD(&lkb->lkb_callbacks); spin_lock_init(&lkb->lkb_cb_lock); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -6221,8 +6223,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } @@ -6263,8 +6264,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_lock(&proc->asts_spin); list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index ce35c3c19aeb..61fe0d1f5646 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -14,12 +14,14 @@ #include "lowcomms.h" #include "config.h" #include "memory.h" +#include "ast.h" static struct kmem_cache *writequeue_cache; static struct kmem_cache *mhandle_cache; static struct kmem_cache *msg_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; +static struct kmem_cache *cb_cache; int __init dlm_memory_init(void) @@ -46,8 +48,16 @@ int __init dlm_memory_init(void) if (!rsb_cache) goto rsb; + cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback), + __alignof__(struct dlm_callback), 0, + NULL); + if (!cb_cache) + goto cb; + return 0; +cb: + kmem_cache_destroy(rsb_cache); rsb: kmem_cache_destroy(msg_cache); msg: @@ -67,6 +77,7 @@ void dlm_memory_exit(void) kmem_cache_destroy(msg_cache); kmem_cache_destroy(lkb_cache); kmem_cache_destroy(rsb_cache); + kmem_cache_destroy(cb_cache); } char *dlm_allocate_lvb(struct dlm_ls *ls) @@ -115,6 +126,11 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kfree(ua); } } + + /* drop references if they are set */ +
dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + kmem_cache_free(lkb_cache, lkb); } @@ -147,3 +163,13 @@ void dlm_free_msg(struct dlm_msg *msg) { kmem_cache_free(msg_cache, msg); } + +struct dlm_callback *dlm_allocate_cb(void) +{ + return kmem_cache_alloc(cb_cache, GFP_ATOMIC); +} + +void dlm_free_cb(struct dlm_callback *cb) +{ + kmem_cache_free(cb_cache, cb); +} diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 7bd3f1a391ca..c1583ec8b2cf 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -26,6 +26,8 @@ struct writequeue_entry *dlm_allocate_writequeue(void); void dlm_free_writequeue(struct writequeue_entry *writequeue); struct dlm_msg *dlm_allocate_msg(gfp_t allocation); void dlm_free_msg(struct dlm_msg *msg); +struct dlm_callback *dlm_allocate_cb(void); +void dlm_free_cb(struct dlm_callback *cb); #endif /* __MEMORY_DOT_H__ */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 6a5de0918a96..6b530db4bc0b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -25,6 +25,7 @@ #include "user.h" #include "ast.h" #include "config.h" +#include "memory.h" static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; @@ -175,7 +176,7 @@ static int lkb_is_endoflife(int mode, int status) being removed and then remove that lkb from the orphans list and free it */ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) + int status, uint32_t sbflags) { struct dlm_ls *ls; struct dlm_user_args *ua; @@ -209,16 +210,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, spin_lock(&proc->asts_spin); - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); - if (rv < 0) { + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_FAILURE: spin_unlock(&proc->asts_spin); + WARN_ON(1); goto out; - } - - if (list_empty(&lkb->lkb_cb_list)) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_cb_list, &proc->asts); wake_up_interruptible(&proc->wait); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON(1); + break; } spin_unlock(&proc->asts_spin); @@ -800,8 +807,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, struct dlm_user_proc *proc = file->private_data; struct dlm_lkb *lkb; DECLARE_WAITQUEUE(wait, current); - struct dlm_callback cb; - int rv, resid, copy_lvb = 0; + struct dlm_callback *cb; + int rv, copy_lvb = 0; int old_mode, new_mode; if (count == sizeof(struct dlm_device_version)) { @@ -860,50 +867,56 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list); /* rem_lkb_callback sets a new lkb_last_cast */ - old_mode = lkb->lkb_last_cast.mode; + old_mode = lkb->lkb_last_cast->mode; - rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); - if (rv < 0) { + rv = dlm_dequeue_lkb_callback(lkb, &cb); + switch (rv) { + case DLM_DEQUEUE_CALLBACK_EMPTY: /* this shouldn't happen; lkb should have been removed from - list when resid was zero */ + * list when last item was dequeued + */ log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); + WARN_ON(1); goto try_another; - } - if (!resid) + case DLM_DEQUEUE_CALLBACK_LAST: 
list_del_init(&lkb->lkb_cb_list); - spin_unlock(&proc->asts_spin); - - if (cb.flags & DLM_CB_SKIP) { - /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) - dlm_put_lkb(lkb); - goto try_another; + /* TODO */ + lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + break; + case DLM_DEQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON(1); + break; } + spin_unlock(&proc->asts_spin); - if (cb.flags & DLM_CB_BAST) { - trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode); - } else if (cb.flags & DLM_CB_CAST) { - new_mode = cb.mode; + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + new_mode = cb->mode; - if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr && dlm_lvb_operations[old_mode + 1][new_mode + 1]) copy_lvb = 1; - lkb->lkb_lksb->sb_status = cb.sb_status; - lkb->lkb_lksb->sb_flags = cb.sb_flags; + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; trace_dlm_ast(lkb->lkb_resource->res_ls, lkb); } rv = copy_result_to_user(lkb->lkb_ua, test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), - cb.flags, cb.mode, copy_lvb, buf, count); + cb->flags, cb->mode, copy_lvb, buf, count); + + kref_put(&cb->ref, dlm_release_callback); /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) + if (rv == DLM_DEQUEUE_CALLBACK_LAST) dlm_put_lkb(lkb); return rv; diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 6b9bce6b96e0..33059452d79e 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -7,7 +7,7 @@ #define __USER_DOT_H__ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); + int status, uint32_t sbflags); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); -- cgit From e1711fe3fd59fa1e34e02add2e9c188441c12021 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:22 -0400 Subject: fs: dlm: allow different allocation context per _create_message This patch allows the caller to control the allocation context on a per-message basis. Currently all messages are forced to be created in GFP_NOFS context.
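The shape of the change, reduced to a hedged sketch (hypothetical helper, not the dlm code):

    /* before: the allocation context is fixed at the lowest layer */
    static void *msg_alloc(size_t len)
    {
            return kmalloc(len, GFP_NOFS);
    }

    /* after: the caller decides, e.g. GFP_NOFS or GFP_ATOMIC */
    static void *msg_alloc(size_t len, gfp_t allocation)
    {
            return kmalloc(len, allocation);
    }

This matters because a caller that holds a spinlock must pass GFP_ATOMIC, while ordinary paths keep GFP_NOFS; the next patch in the series (send_remove() under the rsb table lock) relies on exactly that.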
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 31 +++++++++++++++++++------------ fs/dlm/memory.c | 4 ++-- fs/dlm/memory.h | 2 +- fs/dlm/midcomms.c | 2 +- 4 files changed, 23 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f4b2fb17bbb1..a2930e33c134 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3554,7 +3554,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, static int _create_message(struct dlm_ls *ls, int mb_len, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { struct dlm_message *ms; struct dlm_mhandle *mh; @@ -3564,7 +3565,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb); if (!mh) return -ENOBUFS; @@ -3586,7 +3587,8 @@ static int _create_message(struct dlm_ls *ls, int mb_len, static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { int mb_len = sizeof(struct dlm_message); @@ -3607,7 +3609,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, } return _create_message(r->res_ls, mb_len, to_nodeid, mstype, - ms_ret, mh_ret); + ms_ret, mh_ret, allocation); } /* further lowcomms enhancements or alternate implementations may make @@ -3676,7 +3678,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) if (error) return error; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto fail; @@ -3737,7 +3739,8 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3758,7 +3761,8 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3783,7 +3787,8 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) if (error) return error; - error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh, + GFP_NOFS); if (error) goto fail; @@ -3807,7 +3812,8 @@ static int send_remove(struct dlm_rsb *r) to_nodeid = dlm_dir_nodeid(r); - error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3828,7 +3834,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto out; @@ -3869,7 +3875,8 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); - error = create_message(r, NULL, nodeid, 
DLM_MSG_LOOKUP_REPLY, &ms, &mh); + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -6295,7 +6302,7 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid) int error; error = _create_message(ls, sizeof(struct dlm_message), nodeid, - DLM_MSG_PURGE, &ms, &mh); + DLM_MSG_PURGE, &ms, &mh, GFP_NOFS); if (error) return error; ms->m_nodeid = cpu_to_le32(nodeid); diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 61fe0d1f5646..eb7a08641fcf 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -134,9 +134,9 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } -struct dlm_mhandle *dlm_allocate_mhandle(void) +struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation) { - return kmem_cache_alloc(mhandle_cache, GFP_NOFS); + return kmem_cache_alloc(mhandle_cache, allocation); } void dlm_free_mhandle(struct dlm_mhandle *mhandle) diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index c1583ec8b2cf..6b29563d24f7 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -20,7 +20,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); void dlm_free_lkb(struct dlm_lkb *l); char *dlm_allocate_lvb(struct dlm_ls *ls); void dlm_free_lvb(char *l); -struct dlm_mhandle *dlm_allocate_mhandle(void); +struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation); void dlm_free_mhandle(struct dlm_mhandle *mhandle); struct writequeue_entry *dlm_allocate_writequeue(void); void dlm_free_writequeue(struct writequeue_entry *writequeue); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e10a6e97df44..3a4e20d6cd2d 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1097,7 +1097,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, /* this is a bug, however we going on and hope it will be resolved */ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); - mh = dlm_allocate_mhandle(); + mh = dlm_allocate_mhandle(allocation); if (!mh) goto err; -- cgit From 3872f87b09e2f274ecf477895f9d1f9b9bdcf04b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:23 -0400 Subject: fs: dlm: remove ls_remove_wait waitqueue This patch removes the ls_remove_wait waitqueue handling. The current handling tries to wait, before a lookup is sent out, for an identical resource name that is going to be removed, so that the remove message is sent out before the new lookup message. The reason is that after a lookup request and response, the specific remote rsb will actually be used. A remove message that followed would delete the rsb on the remote side while it is still being used. To get similar behaviour we simply send the remove message out while the rsb lookup lock is held and the rsb is removed from the toss list. Other find_rsb() calls then never have the chance to bring an rsb back to life while a remove message is being sent out (without holding the lock). This behaviour requires a non-sleepable context, which should be provided now, and that might be the reason why it was not implemented this way in the first place.
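Schematically, an illustrative reduction of the shrink_bucket() change below (not the literal hunk):

    spin_lock(&ls->ls_rsbtbl[b].lock);
    rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
    send_remove(r);         /* must not sleep, hence GFP_ATOMIC inside */
    spin_unlock(&ls->ls_rsbtbl[b].lock);

Because the bucket lock is held across send_remove(), no find_rsb() call can revive the rsb between its removal from the toss list and the remove message, which is why the previous patch made the message allocation context selectable.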
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 4 ---- fs/dlm/lock.c | 56 ++------------------------------------------------- fs/dlm/lockspace.c | 3 --- 3 files changed, 2 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 69e3928c756b..6318e0f51bc9 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -592,11 +592,7 @@ struct dlm_ls { int ls_new_rsb_count; struct list_head ls_new_rsb; /* new rsb structs */ - spinlock_t ls_remove_spin; - wait_queue_head_t ls_remove_wait; - char ls_remove_name[DLM_RESNAME_MAXLEN+1]; char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; - int ls_remove_len; int ls_remove_lens[DLM_REMOVE_NAMES_MAX]; struct list_head ls_nodes; /* current nodes in ls */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index a2930e33c134..e1adfa5aed05 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1589,37 +1589,6 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -/* If there's an rsb for the same resource being removed, ensure - * that the remove message is sent before the new lookup message. - */ - -#define DLM_WAIT_PENDING_COND(ls, r) \ - (ls->ls_remove_len && \ - !rsb_cmp(r, ls->ls_remove_name, \ - ls->ls_remove_len)) - -static void wait_pending_remove(struct dlm_rsb *r) -{ - struct dlm_ls *ls = r->res_ls; - restart: - spin_lock(&ls->ls_remove_spin); - if (DLM_WAIT_PENDING_COND(ls, r)) { - log_debug(ls, "delay lookup for remove dir %d %s", - r->res_dir_nodeid, r->res_name); - spin_unlock(&ls->ls_remove_spin); - wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r)); - goto restart; - } - spin_unlock(&ls->ls_remove_spin); -} - -/* - * ls_remove_spin protects ls_remove_name and ls_remove_len which are - * read by other threads in wait_pending_remove. ls_remove_names - * and ls_remove_lens are only used by the scan thread, so they do - * not need protection. - */ - static void shrink_bucket(struct dlm_ls *ls, int b) { struct rb_node *n, *next; @@ -1701,11 +1670,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b) * list and sending the removal. Keeping this gap small is * important to keep us (the master node) from being out of sync * with the remote dir node for very long. - * - * From the time the rsb is removed from toss until just after - * send_remove, the rsb name is saved in ls_remove_name. A new - * lookup checks this to ensure that a new lookup message for the - * same resource name is not sent just before the remove message. 
*/ for (i = 0; i < remote_count; i++) { @@ -1752,22 +1716,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b) } rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - - /* block lookup of same name until we've sent remove */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); + send_remove(r); - - /* allow lookup of name again */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); + spin_unlock(&ls->ls_rsbtbl[b].lock); dlm_free_rsb(r); } @@ -2718,8 +2668,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 0; } - wait_pending_remove(r); - r->res_first_lkid = lkb->lkb_id; send_lookup(r, lkb); return 1; @@ -3813,7 +3761,7 @@ static int send_remove(struct dlm_rsb *r) to_nodeid = dlm_dir_nodeid(r); error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, - GFP_NOFS); + GFP_ATOMIC); if (error) goto out; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 4965737705b7..e7135883cf78 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -524,9 +524,6 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rsbtbl[i].lock); } - spin_lock_init(&ls->ls_remove_spin); - init_waitqueue_head(&ls->ls_remove_wait); - for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, GFP_KERNEL); -- cgit From 194a3fb488f2760eda67c3ab0ce3a095e8006d72 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:24 -0400 Subject: fs: dlm: relax sending to allow receiving This patch additionally drops the sock_mutex when there is a burst of messages to send. Since we have acknowledge handling, we free sending buffers only when we receive an ack back; but if we are stuck looping in send_to_sock() because dlm sends a lot of messages and we never leave the loop, the sending buffers fill up very quickly. We can't receive during this iteration because the sock_mutex is held. This patch unlocks the sock_mutex so it is possible to receive messages while a burst of sending messages happens. This allows memory to be freed up, because acks that have already been received can be processed.
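In outline, the send loop becomes the following (illustrative; have_queued_data() stands in for the real writequeue iteration):

    again:
            count = 0;
            mutex_lock(&con->sock_mutex);
            while (have_queued_data(con)) {
                    /* ... send one writequeue entry ... */
                    if (++count >= MAX_SEND_MSG_COUNT) {
                            mutex_unlock(&con->sock_mutex); /* let the rx path run */
                            cond_resched();
                            goto again;     /* resume sending afterwards */
                    }
            }
            mutex_unlock(&con->sock_mutex);

The window without sock_mutex lets the receive path process incoming acks, so the buffers of already-acknowledged messages can be freed in the middle of a burst.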
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 871d4e9f49fb..b05c6d9b5102 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1418,7 +1418,10 @@ static void send_to_sock(struct connection *con) const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset, ret; - int count = 0; + int count; + +again: + count = 0; mutex_lock(&con->sock_mutex); if (con->sock == NULL) @@ -1453,14 +1456,16 @@ static void send_to_sock(struct connection *con) } else if (ret < 0) goto out; + spin_lock(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + /* Don't starve people filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { + spin_unlock(&con->writequeue_lock); + mutex_unlock(&con->sock_mutex); cond_resched(); - count = 0; + goto again; } - - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); -- cgit From 9c693d76abb3956433a28994924046abf1a73ef4 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:25 -0400 Subject: fs: dlm: catch dlm_add_member() error This patch catches a possible dlm_add_member() error and delivers it to the dlm recovery handling. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/member.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 2af2ccfe43a9..923c01a8a0aa 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -573,7 +573,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) node = &rv->nodes[i]; if (dlm_is_member(ls, node->nodeid)) continue; - dlm_add_member(ls, node); + error = dlm_add_member(ls, node); + if (error) + return error; + log_rinfo(ls, "add member %d", node->nodeid); } -- cgit From 3e54c9e80e68b765d8877023d93f1eea1b9d1c54 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:26 -0400 Subject: fs: dlm: fix log of lowcomms vs midcomms This patch fixes a small logging issue: when dlm_midcomms_start() failed, the message claimed that the dlm subcomponent lowcomms had failed, but lowcomms sits behind the midcomms layer. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index e7135883cf78..96fe8f7ca09c 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -391,7 +391,7 @@ static int threads_start(void) /* Thread for sending/receiving messages for all lockspace's */ error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm lowcomms %d", error); + log_print("cannot start dlm midcomms %d", error); goto scand_fail; } -- cgit From 775af207464bd28a2086f8399c0b2a3f1f40c7ae Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Oct 2022 16:45:27 -0400 Subject: fs: dlm: use WARN_ON_ONCE() instead of WARN_ON() To avoid spamming the console with WARN_ON() reports about invalid states in the dlm midcomms hot path handling, switch to WARN_ON_ONCE() so we are told only once that there might be an issue with the midcomms state handling.
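A hedged example of the difference (EXPECTED is a placeholder, not a dlm symbol):

    /* prints a backtrace every time the condition is true */
    WARN_ON(node->state != EXPECTED);

    /* prints a single backtrace for the lifetime of the kernel */
    WARN_ON_ONCE(node->state != EXPECTED);

Both still evaluate to the condition's value, so surrounding error handling is unchanged; only the reporting frequency differs.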
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 3a4e20d6cd2d..32194a750fe1 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -469,7 +469,7 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -540,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -548,7 +548,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); @@ -770,7 +770,7 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) goto out; } - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer(p, nodeid); break; case DLM_OPTS: @@ -1095,7 +1095,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, } /* this is a bug, however we going on and hope it will be resolved */ - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); mh = dlm_allocate_mhandle(allocation); if (!mh) @@ -1127,7 +1127,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, break; default: dlm_free_mhandle(mh); - WARN_ON(1); + WARN_ON_ONCE(1); goto err; } @@ -1196,7 +1196,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, break; default: srcu_read_unlock(&nodes_srcu, mh->idx); - WARN_ON(1); + WARN_ON_ONCE(1); break; } } @@ -1238,7 +1238,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -1356,7 +1356,7 @@ static void midcomms_node_release(struct rcu_head *rcu) { struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); - WARN_ON(atomic_read(&node->send_queue_cnt)); + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); kfree(node); } -- cgit From 149562f7509404c382c32c3fa8a6ba356135e5cf Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:05 -0500 Subject: mm/hugetlb: add hugetlb_folio_subpool() helpers Allow hugetlbfs_migrate_folio to check and read subpool information by passing in a folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . 
Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd54f67e47fd..c5137607e523 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1091,10 +1091,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(&src->page)) { - hugetlb_set_page_subpool(&dst->page, - hugetlb_page_subpool(&src->page)); - hugetlb_set_page_subpool(&src->page, NULL); + if (hugetlb_folio_subpool(src)) { + hugetlb_set_folio_subpool(dst, + hugetlb_folio_subpool(src)); + hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) -- cgit From ece62684dcfb714b7d8452056b4a33d426b16457 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:06 -0500 Subject: hugetlbfs: convert hugetlb_delete_from_page_cache() to use folios Remove the last caller of delete_from_page_cache() by converting the code to its folio equivalent. Link: https://lkml.kernel.org/r/20220922154207.1575343-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5137607e523..00495fc128c5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,11 +364,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void hugetlb_delete_from_page_cache(struct page *page) +static void hugetlb_delete_from_page_cache(struct folio *folio) { - ClearPageDirty(page); - ClearPageUptodate(page); - delete_from_page_cache(page); + folio_clear_dirty(folio); + folio_clear_uptodate(folio); + filemap_remove_folio(folio); } /* @@ -574,8 +574,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); + VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); + hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, -- cgit From 5d89c224328bce791d051bf60aa92d90bae93c01 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:41 +0800 Subject: fs/proc/kcore.c: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). 
Link: https://lkml.kernel.org/r/20220923033347.3935160-3-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index dff921f7ca33..7692a360972d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -638,10 +637,6 @@ static int __meminit kcore_callback(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block kcore_callback_nb __meminitdata = { - .notifier_call = kcore_callback, - .priority = 0, -}; static struct kcore_list kcore_vmalloc; @@ -694,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - register_hotmemory_notifier(&kcore_callback_nb); + hotplug_memory_notifier(kcore_callback, 0); return 0; } -- cgit From 1eeaa4fd39b0b1b3e986f8eab6978e69b01e3c5e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:47 +0800 Subject: memory: move hotplug memory notifier priority to same file for easy sorting The priority of hotplug memory callback is defined in a different file. And there are some callers using numbers directly. Collect them together into include/linux/memory.h for easy reading. This allows us to sort their priorities more intuitively without additional comments. Link: https://lkml.kernel.org/r/20220923033347.3935160-9-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Christoph Lameter Cc: David Hildenbrand Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 7692a360972d..98f3289556e4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -689,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI); return 0; } -- cgit From e025ab842ec35225b1a8e163d1f311beb9e38ce9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 18 Oct 2022 15:40:14 +0800 Subject: mm: remove kern_addr_valid() completely Most architectures (except arm64/x86/sparc) simply return 1 for kern_addr_valid(), which is only used in read_kcore(), and it calls copy_from_kernel_nofault() which could check whether the address is a valid kernel address. So as there is no need for kern_addr_valid(), let's remove it. Link: https://lkml.kernel.org/r/20221018074014.185687-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Geert Uytterhoeven [m68k] Acked-by: Heiko Carstens [s390] Acked-by: Christoph Hellwig Acked-by: Helge Deller [parisc] Acked-by: Michael Ellerman [powerpc] Acked-by: Guo Ren [csky] Acked-by: Catalin Marinas [arm64] Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Anton Ivanov Cc: Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Greg Ungerer Cc: H. 
Peter Anvin Cc: Huacai Chen Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 98f3289556e4..71157ee35c1a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -540,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: - if (kern_addr_valid(start)) { - /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. - */ - if (copy_from_kernel_nofault(buf, (void *)start, - tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; } } else { - if (clear_user(buffer, tsz)) { + if (copy_to_user(buffer, buf, tsz)) { ret = -EFAULT; goto out; } } -- cgit From 26215b7ee923b9251f7bb12c4e5f09dc465d35f2 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 21 Oct 2022 07:16:08 +0800 Subject: hugetlbfs: fix null-ptr-deref in hugetlbfs_parse_param() Syzkaller reports a null-ptr-deref bug as follows: ====================================================== KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:hugetlbfs_parse_param+0x1dd/0x8e0 fs/hugetlbfs/inode.c:1380 [...] Call Trace: vfs_parse_fs_param fs/fs_context.c:148 [inline] vfs_parse_fs_param+0x1f9/0x3c0 fs/fs_context.c:129 vfs_parse_fs_string+0xdb/0x170 fs/fs_context.c:191 generic_parse_monolithic+0x16f/0x1f0 fs/fs_context.c:231 do_new_mount fs/namespace.c:3036 [inline] path_mount+0x12de/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] ====================================================== According to commit "vfs: parse: deal with zero length string value", the kernel will set param->string to a null pointer in vfs_parse_fs_string() if the fs string has zero length. Yet the problem is that hugetlbfs_parse_param() will dereference param->string without checking whether it is a null pointer. To be more specific, if hugetlbfs_parse_param() parses an illegal mount parameter, such as "size=,", the kernel will construct a struct fs_parameter with a null pointer in vfs_parse_fs_string(), then pass this struct fs_parameter to hugetlbfs_parse_param(), which triggers the above null-ptr-deref bug. This patch solves it by adding a sanity check on param->string in hugetlbfs_parse_param().
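The core of the fix, reduced (illustrative):

    /* param->string may be NULL for a zero-length value, e.g. "size=," */
    if (!param->string || !isdigit(param->string[0]))
            goto bad_val;

The NULL test must come first; the short-circuit of || is what prevents the dereference.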
Link: https://lkml.kernel.org/r/20221020231609.4810-1-yin31149@gmail.com Reported-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Tested-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005ad00405eb7148c6@google.com/ Signed-off-by: Hawkins Jiawei Reviewed-by: Mike Kravetz Cc: Hawkins Jiawei Cc: Muchun Song Cc: Ian Kent Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 00495fc128c5..09e644f80a4a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1378,7 +1378,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; @@ -1388,7 +1388,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; @@ -1404,7 +1404,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; -- cgit From d0e482c45c50117bfb568825a41f0693e5f33c0f Mon Sep 17 00:00:00 2001 From: Oleg Kanatov Date: Fri, 28 Oct 2022 15:16:39 +0300 Subject: jfs: Fix a typo in function jfs_umount When closing the block allocation map, an incorrect pointer was NULL'ed. This commit fixes that. Signed-off-by: Oleg Kanatov Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_umount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 95ebcd17ce75..8ec43f53f686 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -87,7 +87,7 @@ int jfs_umount(struct super_block *sb) dbUnmount(ipbmap, 0); diFreeSpecial(ipbmap); - sbi->ipimap = NULL; + sbi->ipbmap = NULL; /* * Make sure all metadata makes it to disk before we mark -- cgit From a60dca73a1a8079d867b2c2e9549440346c1ba83 Mon Sep 17 00:00:00 2001 From: Oleg Kanatov Date: Fri, 28 Oct 2022 15:22:54 +0300 Subject: jfs: makes diUnmount/diMount in jfs_mount_rw atomic jfs_mount_rw can call diUnmount and then diMount. These calls change the imap pointer. Between these two calls there may be calls to jfs_lookup(). The jfs_lookup() function calls jfs_iget(), which, in turn, calls diRead(). The latter references the imap pointer. That may cause diRead() to refer to a pointer freed in diUnmount(). This commit makes the calls to diUnmount()/diMount() atomic so that nothing will read the imap pointer until the whole remount is completed.
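Schematically, the remount-critical section becomes (a reduction of the hunks below):

    IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP);  /* blocks concurrent diRead() */
    diUnmount(sbi->ipimap, 1);                /* frees the old imap */
    rc = diMount(sbi->ipimap);                /* installs the new imap */
    IWRITE_UNLOCK(sbi->ipimap);

and diRead() now takes the matching IREAD_LOCK before it loads the imap pointer, so it can never observe the freed value.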
Signed-off-by: Oleg Kanatov Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_imap.c | 2 +- fs/jfs/jfs_mount.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 799d3837e7c2..390cbfce391f 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -310,8 +310,8 @@ int diRead(struct inode *ip) iagno = INOTOIAG(ip->i_ino); /* read the iag */ - imap = JFS_IP(ipimap)->i_imap; IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + imap = JFS_IP(ipimap)->i_imap; rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) { diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 48d1f70f786c..b83aae56a1f2 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -234,11 +234,15 @@ int jfs_mount_rw(struct super_block *sb, int remount) truncate_inode_pages(sbi->ipimap->i_mapping, 0); truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + + IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP); diUnmount(sbi->ipimap, 1); if ((rc = diMount(sbi->ipimap))) { + IWRITE_UNLOCK(sbi->ipimap); jfs_err("jfs_mount_rw: diMount failed!"); return rc; } + IWRITE_UNLOCK(sbi->ipimap); dbUnmount(sbi->ipbmap, 1); if ((rc = dbMount(sbi->ipbmap))) { -- cgit From d030bd1a66580b7b198a085e7220c794bcdc2770 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 11 Nov 2022 01:43:52 -0500 Subject: ext2: Fix some kernel-doc warnings The current code provokes some kernel-doc warnings: fs/ext2/dir.c:417: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Signed-off-by: Bo Liu Acked-by: Randy Dunlap Signed-off-by: Jan Kara --- fs/ext2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 8f597753ac12..27873733ed8c 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -413,7 +413,7 @@ found: return de; } -/** +/* * Return the '..' directory entry and the page in which the entry was found * (as a parameter - p). * -- cgit From e6ecb142429183cef4835f31d4134050ae660032 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 8 Nov 2022 17:59:34 -0800 Subject: f2fs: allow to read node block after shutdown If the block address is still alive, we should give a valid node block even after shutdown. Otherwise, we can see zero data when reading out a file. Cc: stable@vger.kernel.org Fixes: 83a3bfdb5a8a ("f2fs: indicate shutdown f2fs to allow unmount successfully") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 983572f23896..b9ee5a1176a0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1360,8 +1360,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags) return err; /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ - if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) { ClearPageUptodate(page); return -ENOENT; } -- cgit From 225d6795abf47c3340214ca1b4c22728e463db4f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 21:26:38 +0800 Subject: f2fs: add proc entry to show discard_plist info This patch adds a new proc entry to show discard_plist information in more detail, which is very helpful for seeing the discard pending list counts clearly.
Such as: Discard pend list(Show diacrd_cmd count on each entry, .:not exist): 0 390 156 85 67 46 37 26 14 8 17 12 9 9 6 12 11 10 16 5 9 2 4 8 3 4 1 24 3 2 2 5 2 4 5 4 32 3 3 2 3 . 3 3 1 40 . 4 1 3 2 1 2 1 48 1 . 1 1 . 1 1 . 56 . 1 1 1 . 2 . 1 64 1 2 . . . . . . 72 . 1 . . . . . . 80 3 1 . . 1 1 . . 88 1 . . . 1 . . 1 ...... Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 032c03e09580..97bf0dbb0974 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1252,6 +1252,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1322,6 +1360,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); } return 0; put_feature_list_kobj: @@ -1345,6 +1385,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("discard_plist_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } -- cgit From 4d8d45df2252980f800c1b2fde941a103a18a70e Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 31 Oct 2022 12:24:15 -0700 Subject: f2fs: correct i_size change for atomic writes We need to make sure i_size doesn't change until the atomic write commit is successful, and restore it when the commit fails.
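As a rough illustration of the save/restore idea (a hypothetical sketch, not the f2fs code; original_i_size merely mirrors the field the patch adds):

  struct file_state {
          long long i_size;
          long long original_i_size;      /* captured when the atomic write starts */
          int committed;
  };

  static void start_atomic_write(struct file_state *f)
  {
          f->original_i_size = f->i_size; /* remember the pre-write size */
          f->committed = 0;
  }

  static void commit_atomic_write(struct file_state *f)
  {
          f->committed = 1;               /* i_size changes may now be kept */
  }

  static void abort_atomic_write(struct file_state *f)
  {
          if (!f->committed)
                  f->i_size = f->original_i_size; /* roll back on failure */
  }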
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 18 +++++++++++------- fs/f2fs/inode.c | 5 ++++- fs/f2fs/segment.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04ef4cce3d7f..11c475beca2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -768,6 +768,7 @@ enum { FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ FI_MAX, /* max flag, never be used */ }; @@ -826,6 +827,7 @@ struct f2fs_inode_info { unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ }; static inline void get_extent_info(struct extent_info *ext, @@ -3075,6 +3077,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode, set_inode_flag(inode, FI_AUTO_RECOVER); } +static inline bool f2fs_is_atomic_file(struct inode *inode); + static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); @@ -3084,6 +3088,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c605a4f2bce2..28f586e77999 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2041,6 +2041,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; + loff_t isize; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2099,7 +2100,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + + f2fs_write_inode(inode, NULL); + + isize = i_size_read(inode); + fi->original_i_size = isize; + f2fs_i_size_write(fi->cow_inode, isize); stat_inc_atomic_inode(inode); @@ -2137,16 +2143,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); - if (ret) - goto unlock_out; - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_abort_atomic_write(inode, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9f0d3864d9f1..577f109b4e1d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -621,9 +621,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + if (et) { read_lock(&et->lock); set_raw_extent(&et->largest, &ri->i_ext); diff --git a/fs/f2fs/segment.c 
b/fs/f2fs/segment.c index aa4be7f25963..8aa81238c770 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,14 +192,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; - if (clean) - truncate_inode_pages_final(inode->i_mapping); clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + } } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -335,10 +339,12 @@ next: } out: - if (ret) + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; - else + } else { sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + } __complete_revoke_list(inode, &revoke_list, ret ? true : false); -- cgit From cc249e4cba9a6002c9d9e1438daf8440a160bc9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 6 Nov 2022 21:25:44 +0800 Subject: f2fs: fix to avoid accessing uninitialized spinlock syzbot reports a kernel bug: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e3/0x2cb lib/dump_stack.c:106 assign_lock_key+0x22a/0x240 kernel/locking/lockdep.c:981 register_lock_class+0x287/0x9b0 kernel/locking/lockdep.c:1294 __lock_acquire+0xe4/0x1f60 kernel/locking/lockdep.c:4934 lock_acquire+0x1a7/0x400 kernel/locking/lockdep.c:5668 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:350 [inline] f2fs_save_errors fs/f2fs/super.c:3868 [inline] f2fs_handle_error+0x29/0x230 fs/f2fs/super.c:3896 f2fs_iget+0x215/0x4bb0 fs/f2fs/inode.c:516 f2fs_fill_super+0x47d3/0x7b50 fs/f2fs/super.c:4222 mount_bdev+0x26c/0x3a0 fs/super.c:1401 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1531 do_new_mount+0x289/0xad0 fs/namespace.c:3040 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount+0x2e3/0x3d0 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd F2FS-fs (loop1): Failed to read F2FS meta data inode The root cause is that sbi->error_lock may be accessed before its initialization; fix it.
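The underlying rule is that a lock must be initialized before any code path can possibly take it. A hypothetical user-space sketch of the corrected ordering (a pthread mutex stands in for the spinlock):

  #include <pthread.h>

  struct sb_stub {
          pthread_mutex_t error_lock;
          unsigned char errors[16];
  };

  static void handle_error(struct sb_stub *sbi)
  {
          pthread_mutex_lock(&sbi->error_lock);   /* lock is guaranteed initialized */
          sbi->errors[0] = 1;
          pthread_mutex_unlock(&sbi->error_lock);
  }

  static int fill_super(struct sb_stub *sbi)
  {
          /* initialize right after allocation, before anything can fail */
          pthread_mutex_init(&sbi->error_lock, NULL);

          /* ... any error path from here on may safely call handle_error() ... */
          return 0;
  }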
Link: https://lore.kernel.org/linux-f2fs-devel/0000000000007edb6605ecbb6442@google.com/T/#u Reported-by: syzbot+40642be9b7e0bb28e0df@syzkaller.appspotmail.com Fixes: 95fa90c9e5a7 ("f2fs: support recording errors into superblock") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a43d8a46a6e5..68a6c2eedcac 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4196,6 +4196,9 @@ try_onemore: if (err) goto free_bio_info; + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); @@ -4263,9 +4266,6 @@ try_onemore: goto free_devices; } - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, -- cgit From 59237a21776f70ffb0420611c23e7158e1317037 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Nov 2022 22:33:21 +0800 Subject: f2fs: optimize iteration over sparse directories Wei Chen reports a kernel bug as below: INFO: task syz-executor.0:29056 blocked for more than 143 seconds. Not tainted 5.15.0-rc5 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor.0 state:D stack:14632 pid:29056 ppid: 6574 flags:0x00000004 Call Trace: __schedule+0x4a1/0x1720 schedule+0x36/0xe0 rwsem_down_write_slowpath+0x322/0x7a0 fscrypt_ioctl_set_policy+0x11f/0x2a0 __f2fs_ioctl+0x1a9f/0x5780 f2fs_ioctl+0x89/0x3a0 __x64_sys_ioctl+0xe8/0x140 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Eric did some investigation on this issue; quoting from Eric's reply: "Well, the quality of this bug report has a lot to be desired (not on upstream kernel, reproducer is full of totally irrelevant stuff, not sent to the mailing list of the filesystem whose disk image is being fuzzed, etc.). But what is going on is that f2fs_empty_dir() doesn't consider the case of a directory with an extremely large i_size on a malicious disk image. Specifically, the reproducer mounts an f2fs image with a directory that has an i_size of 14814520042850357248, then calls FS_IOC_SET_ENCRYPTION_POLICY on it. That results in a call to f2fs_empty_dir() to check whether the directory is empty. f2fs_empty_dir() then iterates through all 3616826182336513 blocks the directory allegedly contains to check whether any contain anything. i_rwsem is held during this, so anything else that tries to take it will hang." In order to solve this issue, let's use f2fs_get_next_page_offset() to speed up iteration by skipping holes in all of the functions below: - f2fs_empty_dir - f2fs_readdir - find_in_level The way this speeds up iteration was described in 'commit 3cf4574705b4 ("f2fs: introduce get_next_page_offset to speed up SEEK_DATA")'. Meanwhile, in f2fs_empty_dir(), let's use f2fs_find_data_page() instead of f2fs_get_lock_data_page(): since i_rwsem is held by the caller of f2fs_empty_dir(), there shouldn't be any races, so it's fine not to lock the dentry page while looking up dirents in it.
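A hypothetical sketch of the iteration pattern the patch switches to (find_block() is an illustrative stand-in for f2fs_find_data_page() with its new *next_pgofs out-parameter):

  typedef unsigned long pgoff_t;

  /* stand-in for f2fs_find_data_page(): pretend even indices are holes and
   * report the first index after the hole via *next; in the real code *next
   * can skip an arbitrarily large hole in one step */
  static int find_block(pgoff_t idx, pgoff_t *next)
  {
          if (idx % 2 == 0) {
                  *next = idx + 1;
                  return -1;      /* -ENOENT in the real code */
          }
          return 0;
  }

  static void scan_dir(pgoff_t nblocks)
  {
          pgoff_t bidx = 0;

          while (bidx < nblocks) {
                  pgoff_t next;

                  if (find_block(bidx, &next) < 0) {
                          bidx = next;    /* jump over the whole hole at once */
                          continue;
                  }
                  /* ... check the dentry block at bidx ... */
                  bidx++;
          }
  }

Instead of visiting all 3616826182336513 alleged block indices of the crafted directory one by one, the loop can now jump straight past unallocated ranges.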
Link: https://lore.kernel.org/lkml/536944df-a0ae-1dd8-148f-510b476e1347@kernel.org/T/ Reported-by: Wei Chen Cc: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++++++----- fs/f2fs/dir.c | 34 ++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 4 ++-- 4 files changed, 41 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a71e818cd67b..9b47ded653d1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1206,7 +1206,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write) + blk_opf_t op_flags, bool for_write, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -1232,12 +1233,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; + } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && @@ -1281,7 +1287,8 @@ put_err: return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1291,7 +1298,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = f2fs_get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; @@ -1317,7 +1324,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = f2fs_get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 21960a899b6a..030b7fd4142f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; int max_slots; @@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; + bidx = next_pgofs; continue; } else { *res_page = dentry_page; @@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); + + bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { @@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; + unsigned long bidx = 0; struct page 
*dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; @@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir) if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); - for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } dentry_blk = page_address(dentry_page); @@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; } return true; } @@ -1104,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -1118,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_find_data_page(inode, n); + dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; + n = next_pgofs; continue; } else { goto out_free; @@ -1141,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } f2fs_put_page(dentry_page, 0); + + n++; } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11c475beca2c..6a8cbf5bb187 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3807,8 +3807,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6466db75af5d..f1a46519a5fe 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1562,8 +1562,8 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); -- cgit From 92b4cf5b48955a4bdd15fe4e2067db8ebd87f04c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 9 Nov 2022 07:04:42 +0900 Subject: f2fs: initialize locks earlier in f2fs_fill_super() syzbot is reporting a lockdep warning at f2fs_handle_error() [1], because spin_lock(&sbi->error_lock) is called before spin_lock_init() is called. For safe locking in error handling, move the initialization of locks (and obvious structures) in f2fs_fill_super() to immediately after memory allocation.
Link: https://syzkaller.appspot.com/bug?extid=40642be9b7e0bb28e0df [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Tested-by: syzbot Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68a6c2eedcac..8f4fc3ad6765 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4103,6 +4103,24 @@ try_onemore: sbi->sb = sb; + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -4126,6 +4144,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -4182,26 +4202,14 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_f2fs_rwsem(&sbi->gc_lock); - mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); - spin_lock_init(&sbi->stat_lock); err = f2fs_init_write_merge_io(sbi); if (err) goto free_bio_info; - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - - init_f2fs_rwsem(&sbi->cp_rwsem); - init_f2fs_rwsem(&sbi->quota_sem); - init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); err = f2fs_init_iostat(sbi); @@ -4279,12 +4287,6 @@ try_onemore: limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); - for (i = 0; i < NR_INODE_TYPE; i++) { - INIT_LIST_HEAD(&sbi->inode_list[i]); - spin_lock_init(&sbi->inode_lock[i]); - } - mutex_init(&sbi->flush_lock); - f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); -- cgit From 967eaad1fed5f6335ea97a47d45214744dc57925 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 10 Nov 2022 17:15:01 +0800 Subject: f2fs: fix to set flush_merge opt and show noflush_merge Some minor modifications to flush_merge and related parameters: 1. The FLUSH_MERGE option is set by default only in non-ro mode. 2. When ro and flush_merge are set at the same time, an error is reported. 3. Display the noflush_merge mount option.
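Condensed, the resulting option logic looks roughly like this (a hypothetical sketch, not the f2fs code):

  #include <errno.h>

  static int apply_flush_merge(int readonly, int user_set_flush_merge,
                               int *flush_merge)
  {
          /* an explicit flush_merge on a read-only mount is now rejected */
          if (readonly && user_set_flush_merge)
                  return -EINVAL;

          /* the option defaults to on only for read-write mounts */
          *flush_merge = readonly ? 0 : 1;
          return 0;
  }

And since f2fs_show_options() now always prints one of flush_merge/noflush_merge, the effective state is visible in the mount information either way.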
Suggested-by: Chao Yu Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f4fc3ad6765..75027ff85cd9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1353,6 +1353,12 @@ default_check: return -EINVAL; } + if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && + test_opt(sbi, FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1941,8 +1947,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); else @@ -2073,7 +2081,8 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - set_opt(sbi, FLUSH_MERGE); + if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { -- cgit From 3b4c7bc01727e3a465759236eeac03d0dd686da3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 4 Nov 2022 13:52:42 +0100 Subject: xattr: use rbtree for simple_xattrs A while ago Vasily reported that it is possible to set a large number of xattrs on inodes of filesystems that make use of the simple xattr infrastructure. This includes all kernfs-based filesystems that support xattrs (e.g., cgroupfs and tmpfs). Both cgroupfs and tmpfs can be mounted by unprivileged users in unprivileged containers, and root in an unprivileged container can set an unrestricted number of security.* xattrs and privileged users can also set unlimited trusted.* xattrs. As there are apparently users that have a fairly large number of xattrs, we should scale a bit better. Other xattrs such as user.* are restricted for kernfs-based instances to a fairly limited number. Using a simple linked list protected by a spinlock used for set, get, and list operations doesn't scale well if users use a lot of xattrs even if it's not a crazy number. There's no need to bring in the big guns like rhashtables or rw semaphores for this. An rbtree with an rwlock, or limited RCU semantics and a seqlock, is enough. It scales within the constraints we are working in. By far the most common operation is getting an xattr. Setting xattrs should be a moderately rare operation. And listxattr() often only happens when copying xattrs between files or together with the contents to a new file. Holding a lock across listxattr() is unproblematic because it doesn't list the values of xattrs. It can only be used to list the names of all xattrs set on a file. And the number of xattr names that can be listed with listxattr() is limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names are found it will return -E2BIG. In short, the maximum amount of memory that can be retrieved via listxattr() is limited. Of course, the API is broken as documented on xattr(7) already.
In the future we might want to address this but for now this is the world we live in and have lived for a long time. But it does indeed mean that once an application goes over XATTR_LIST_MAX limit of xattrs set on an inode it isn't possible to copy the file and include its xattrs in the copy unless the caller knows all xattrs or limits the copy of the xattrs to important ones it knows by name (At least for tmpfs, and kernfs-based filesystems. Other filesystems might provide ways of achieving this.). Bonus of this port to rbtree+rwlock is that we shrink the memory consumption for users of the simple xattr infrastructure. Also add proper kernel documentation to all the functions. A big thanks to Paul for his comments. Cc: Vasily Averin Cc: "Paul E. McKenney" Acked-by: Roman Gushchin Acked-by: Paul E. McKenney Signed-off-by: Christian Brauner (Microsoft) --- fs/xattr.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 250 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..402d9d43fde0 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -992,8 +992,29 @@ const char *xattr_full_name(const struct xattr_handler *handler, } EXPORT_SYMBOL(xattr_full_name); -/* - * Allocate new xattr and copy in the value; but leave the name to callers. +/** + * free_simple_xattr - free an xattr object + * @xattr: the xattr object + * + * Free the xattr object. Can handle @xattr being NULL. + */ +static inline void free_simple_xattr(struct simple_xattr *xattr) +{ + if (xattr) + kfree(xattr->name); + kvfree(xattr); +} + +/** + * simple_xattr_alloc - allocate new xattr object + * @value: value of the xattr object + * @size: size of @value + * + * Allocate a new xattr object and initialize respective members. The caller is + * responsible for handling the name of the xattr. + * + * Return: On success a new xattr object is returned. On failure NULL is + * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { @@ -1014,20 +1035,69 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) return new_xattr; } -/* - * xattr GET operation for in-memory/pseudo filesystems +/** + * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry + * @key: xattr name + * @node: current node + * + * Compare the xattr name with the xattr name attached to @node in the rbtree. + * + * Return: Negative value if continuing left, positive if continuing right, 0 + * if the xattr attached to @node matches @key. + */ +static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) +{ + const char *xattr_name = key; + const struct simple_xattr *xattr; + + xattr = rb_entry(node, struct simple_xattr, rb_node); + return strcmp(xattr->name, xattr_name); +} + +/** + * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes + * @new_node: new node + * @node: current node + * + * Compare the xattr attached to @new_node with the xattr attached to @node. + * + * Return: Negative value if continuing left, positive if continuing right, 0 + * if the xattr attached to @new_node matches the xattr attached to @node. 
+ */ +static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, + const struct rb_node *node) +{ + struct simple_xattr *xattr; + xattr = rb_entry(new_node, struct simple_xattr, rb_node); + return rbtree_simple_xattr_cmp(xattr->name, node); +} + +/** + * simple_xattr_get - get an xattr object + * @xattrs: the header of the xattr object + * @name: the name of the xattr to retrieve + * @buffer: the buffer to store the value into + * @size: the size of @buffer + * + * Try to find and retrieve the xattr object associated with @name. + * If @buffer is provided store the value of @xattr in @buffer + * otherwise just return the length. The size of @buffer is limited + * to XATTR_SIZE_MAX which currently is 65536. + * + * Return: On success the length of the xattr value is returned. On error a + * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { - struct simple_xattr *xattr; + struct simple_xattr *xattr = NULL; + struct rb_node *rbp; int ret = -ENODATA; - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { - if (strcmp(name, xattr->name)) - continue; - + read_lock(&xattrs->lock); + rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); + if (rbp) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) @@ -1035,34 +1105,44 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, else memcpy(buffer, xattr->value, xattr->size); } - break; } - spin_unlock(&xattrs->lock); + read_unlock(&xattrs->lock); return ret; } /** - * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems - * @xattrs: target simple_xattr list - * @name: name of the extended attribute - * @value: value of the xattr. If %NULL, will remove the attribute. - * @size: size of the new xattr - * @flags: %XATTR_{CREATE|REPLACE} - * @removed_size: returns size of the removed xattr, -1 if none removed + * simple_xattr_set - set an xattr object + * @xattrs: the header of the xattr object + * @name: the name of the xattr to retrieve + * @value: the value to store along the xattr + * @size: the size of @value + * @flags: the flags determining how to set the xattr + * @removed_size: the size of the removed xattr + * + * Set a new xattr object. + * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE + * is specified in @flags a matching xattr object for @name must already exist. + * If it does it will be replaced with the new xattr object. If it doesn't we + * fail. If XATTR_CREATE is specified and a matching xattr does already exist + * we fail. If it doesn't we create a new xattr. If @flags is zero we simply + * insert the new xattr replacing any existing one. + * + * If @value is empty and a matching xattr object is found we delete it if + * XATTR_REPLACE is specified in @flags or @flags is zero. * - * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails - * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; - * otherwise, fails with -ENODATA. + * If @value is empty and no matching xattr object for @name is found we do + * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For + * XATTR_REPLACE we fail as mentioned above. * - * Returns 0 on success, -errno on failure. + * Return: On success zero and on error a negative error code is returned. 
*/ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags, ssize_t *removed_size) { - struct simple_xattr *xattr; - struct simple_xattr *new_xattr = NULL; - int err = 0; + struct simple_xattr *xattr = NULL, *new_xattr = NULL; + struct rb_node *parent = NULL, **rbp; + int err = 0, ret; if (removed_size) *removed_size = -1; @@ -1075,42 +1155,68 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { - kvfree(new_xattr); + free_simple_xattr(new_xattr); return -ENOMEM; } } - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { - if (!strcmp(name, xattr->name)) { - if (flags & XATTR_CREATE) { - xattr = new_xattr; - err = -EEXIST; - } else if (new_xattr) { - list_replace(&xattr->list, &new_xattr->list); - if (removed_size) - *removed_size = xattr->size; - } else { - list_del(&xattr->list); - if (removed_size) - *removed_size = xattr->size; - } - goto out; - } - } - if (flags & XATTR_REPLACE) { - xattr = new_xattr; - err = -ENODATA; - } else { - list_add(&new_xattr->list, &xattrs->head); - xattr = NULL; + write_lock(&xattrs->lock); + rbp = &xattrs->rb_root.rb_node; + while (*rbp) { + parent = *rbp; + ret = rbtree_simple_xattr_cmp(name, *rbp); + if (ret < 0) + rbp = &(*rbp)->rb_left; + else if (ret > 0) + rbp = &(*rbp)->rb_right; + else + xattr = rb_entry(*rbp, struct simple_xattr, rb_node); + if (xattr) + break; } -out: - spin_unlock(&xattrs->lock); + if (xattr) { - kfree(xattr->name); - kvfree(xattr); + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out_unlock; + } + + if (new_xattr) + rb_replace_node(&xattr->rb_node, &new_xattr->rb_node, + &xattrs->rb_root); + else + rb_erase(&xattr->rb_node, &xattrs->rb_root); + if (!err && removed_size) + *removed_size = xattr->size; + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out_unlock; + } + + /* + * If XATTR_CREATE or no flags are specified together with a + * new value simply insert it. + */ + if (new_xattr) { + rb_link_node(&new_xattr->rb_node, parent, rbp); + rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); + } + + /* + * If XATTR_CREATE or no flags are specified and neither an + * old or new xattr exist then we don't need to do anything. + */ } + +out_unlock: + write_unlock(&xattrs->lock); + if (err) + free_simple_xattr(new_xattr); + else + free_simple_xattr(xattr); return err; } @@ -1134,14 +1240,31 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size, return 0; } -/* - * xattr LIST operation for in-memory/pseudo filesystems +/** + * simple_xattr_list - list all xattr objects + * @inode: inode from which to get the xattrs + * @xattrs: the header of the xattr object + * @buffer: the buffer to store all xattrs into + * @size: the size of @buffer + * + * List all xattrs associated with @inode. If @buffer is NULL we returned + * the required size of the buffer. If @buffer is provided we store the + * xattrs value into it provided it is big enough. + * + * Note, the number of xattr names that can be listed with listxattr(2) is + * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed + * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names + * are found it will return -E2BIG. + * + * Return: On success the required size or the size of the copied xattrs is + * returned. 
On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; + struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; @@ -1162,8 +1285,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, } #endif - spin_lock(&xattrs->lock); - list_for_each_entry(xattr, &xattrs->head, list) { + read_lock(&xattrs->lock); + for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; @@ -1172,18 +1297,76 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (err) break; } - spin_unlock(&xattrs->lock); + read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } -/* - * Adds an extended attribute to the list +/** + * rbtree_simple_xattr_less - compare two xattr rbtree nodes + * @new_node: new node + * @node: current node + * + * Compare the xattr attached to @new_node with the xattr attached to @node. + * Note that this function technically tolerates duplicate entries. + * + * Return: True if insertion point in the rbtree is found. */ -void simple_xattr_list_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr) +static bool rbtree_simple_xattr_less(struct rb_node *new_node, + const struct rb_node *node) { - spin_lock(&xattrs->lock); - list_add(&new_xattr->list, &xattrs->head); - spin_unlock(&xattrs->lock); + return rbtree_simple_xattr_node_cmp(new_node, node) < 0; +} + +/** + * simple_xattr_add - add xattr objects + * @xattrs: the header of the xattr object + * @new_xattr: the xattr object to add + * + * Add an xattr object to @xattrs. This assumes no replacement or removal + * of matching xattrs is wanted. Should only be called during inode + * initialization when a few distinct initial xattrs are supposed to be set. + */ +void simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + write_lock(&xattrs->lock); + rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); + write_unlock(&xattrs->lock); +} + +/** + * simple_xattrs_init - initialize new xattr header + * @xattrs: header to initialize + * + * Initialize relevant fields of a an xattr header. + */ +void simple_xattrs_init(struct simple_xattrs *xattrs) +{ + xattrs->rb_root = RB_ROOT; + rwlock_init(&xattrs->lock); +} + +/** + * simple_xattrs_free - free xattrs + * @xattrs: xattr header whose xattrs to destroy + * + * Destroy all xattrs in @xattr. When this is called no one can hold a + * reference to any of the xattrs anymore. + */ +void simple_xattrs_free(struct simple_xattrs *xattrs) +{ + struct rb_node *rbp; + + rbp = rb_first(&xattrs->rb_root); + while (rbp) { + struct simple_xattr *xattr; + struct rb_node *rbp_next; + + rbp_next = rb_next(rbp); + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + rb_erase(&xattr->rb_node, &xattrs->rb_root); + free_simple_xattr(xattr); + rbp = rbp_next; + } } -- cgit From e40df4281b86d5f7c1615dd9eda597675340a8d3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 12 Nov 2022 22:36:42 +0100 Subject: orangefs: fix mode handling In 4053d2500beb ("orangefs: rework posix acl handling when creating new filesystem objects") we tried to precalculate the correct mode when creating a new inode. 
However, this leads to regressions when creating new filesystem objects. Even if we precalculate the mode we still need to call __orangefs_setattr() to perform additional checks and we also need to update the mode of ACL_TYPE_ACCESS acls set on the inode. The patch referenced above regressed that. Restore that part of the old behavior and remove the mode precalculation as it doesn't get us anything anymore. Fixes: 4053d2500beb ("orangefs: rework posix acl handling when creating new filesystem objects") Reported-by: Mike Marshall Signed-off-by: Christian Brauner (Microsoft) --- fs/orangefs/inode.c | 11 ++++++++++- fs/orangefs/orangefs-kernel.h | 1 - fs/orangefs/orangefs-utils.c | 10 ++-------- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 8974b0fbf00d..3d65accfae17 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -1131,7 +1131,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = __orangefs_inode_getattr(inode, mode, ORANGEFS_GETATTR_NEW); + error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); if (error) goto out_iput; @@ -1158,6 +1158,15 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, gossip_debug(GOSSIP_INODE_DEBUG, "Initializing ACL's for inode %pU\n", get_khandle_from_ino(inode)); + if (mode != inode->i_mode) { + struct iattr iattr = { + .ia_mode = mode, + .ia_valid = ATTR_MODE, + }; + inode->i_mode = mode; + __orangefs_setattr(inode, &iattr); + __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + } posix_acl_release(acl); posix_acl_release(default_acl); return inode; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 55cd6d50eea1..6e0cc01b3a14 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -423,7 +423,6 @@ int orangefs_inode_setxattr(struct inode *inode, #define ORANGEFS_GETATTR_SIZE 2 int orangefs_inode_getattr(struct inode *, int); -int __orangefs_inode_getattr(struct inode *inode, umode_t mode, int flags); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 334a2fd98c37..46b7dcff18ac 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -233,7 +233,7 @@ static int orangefs_inode_is_stale(struct inode *inode, return 0; } -int __orangefs_inode_getattr(struct inode *inode, umode_t mode, int flags) +int orangefs_inode_getattr(struct inode *inode, int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -369,8 +369,7 @@ again2: /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? 
S_ISVTX : 0) | - orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes) | - mode; + orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; @@ -382,11 +381,6 @@ out: return ret; } -int orangefs_inode_getattr(struct inode *inode, int flags) -{ - return __orangefs_inode_getattr(inode, 0, flags); -} - int orangefs_inode_check_changed(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); -- cgit From f1bd37a4735286585751dbd9db330b48525cb193 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 14 Nov 2022 10:35:32 -0800 Subject: iomap: directly use logical block size Don't transform the logical block size to a bit shift only to shift it back to the original block size. Just use the size. Cc: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Darrick J. Wong Reviewed-by: Bart Van Assche Signed-off-by: Keith Busch Signed-off-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- fs/iomap/direct-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4eb559a16c9e..9804714b1751 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -240,7 +240,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; - unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; loff_t length = iomap_length(iter); loff_t pos = iter->pos; @@ -252,7 +251,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; - if ((pos | length) & ((1 << blkbits) - 1) || + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; -- cgit From bdcdd86ca94b5e9faa18d6f4d3dda660ac5c887e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 00:54:40 +0000 Subject: btrfs: fix assertion failure and blocking during nowait buffered write When doing a nowait buffered write we can trigger the following assertion: [11138.437027] assertion failed: !path->nowait, in fs/btrfs/ctree.c:4658 [11138.438251] ------------[ cut here ]------------ [11138.438254] kernel BUG at fs/btrfs/messages.c:259! [11138.438762] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [11138.439450] CPU: 4 PID: 1091021 Comm: fsstress Not tainted 6.1.0-rc4-btrfs-next-128 #1 [11138.440611] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [11138.442553] RIP: 0010:btrfs_assertfail+0x19/0x1b [btrfs] [11138.443583] Code: 5b 41 5a 41 (...) 
[11138.446437] RSP: 0018:ffffbaf0cf05b840 EFLAGS: 00010246 [11138.447235] RAX: 0000000000000039 RBX: ffffbaf0cf05b938 RCX: 0000000000000000 [11138.448303] RDX: 0000000000000000 RSI: ffffffffb2ef59f6 RDI: 00000000ffffffff [11138.449370] RBP: ffff9165f581eb68 R08: 00000000ffffffff R09: 0000000000000001 [11138.450493] R10: ffff9167a88421f8 R11: 0000000000000000 R12: ffff9164981b1000 [11138.451661] R13: 000000008c8f1000 R14: ffff9164991d4000 R15: ffff9164981b1000 [11138.452225] FS: 00007f1438a66440(0000) GS:ffff9167ad600000(0000) knlGS:0000000000000000 [11138.452949] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [11138.453394] CR2: 00007f1438a64000 CR3: 0000000100c36002 CR4: 0000000000370ee0 [11138.454057] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [11138.454879] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [11138.455779] Call Trace: [11138.456211] [11138.456598] btrfs_next_old_leaf.cold+0x18/0x1d [btrfs] [11138.457827] ? kmem_cache_alloc+0x18d/0x2a0 [11138.458516] btrfs_lookup_csums_range+0x149/0x4d0 [btrfs] [11138.459407] csum_exist_in_range+0x56/0x110 [btrfs] [11138.460271] can_nocow_file_extent+0x27c/0x310 [btrfs] [11138.461155] can_nocow_extent+0x1ec/0x2e0 [btrfs] [11138.461672] btrfs_check_nocow_lock+0x114/0x1c0 [btrfs] [11138.462951] btrfs_buffered_write+0x44c/0x8e0 [btrfs] [11138.463482] btrfs_do_write_iter+0x42b/0x5f0 [btrfs] [11138.463982] ? lock_release+0x153/0x4a0 [11138.464347] io_write+0x11b/0x570 [11138.464660] ? lock_release+0x153/0x4a0 [11138.465213] ? lock_is_held_type+0xe8/0x140 [11138.466003] io_issue_sqe+0x63/0x4a0 [11138.466339] io_submit_sqes+0x238/0x770 [11138.466741] __do_sys_io_uring_enter+0x37b/0xb10 [11138.467206] ? lock_is_held_type+0xe8/0x140 [11138.467879] ? syscall_enter_from_user_mode+0x1d/0x50 [11138.468688] do_syscall_64+0x38/0x90 [11138.469265] entry_SYSCALL_64_after_hwframe+0x63/0xcd [11138.470017] RIP: 0033:0x7f1438c539e6 This is because, to check if we can NOCOW, we first check whether we can NOCOW into an extent (it's a prealloc extent or the inode has the NOCOW attribute), and then check if there are csums for the extent's range in the csum tree. The search may leave us beyond the last slot of a leaf, and then when we call btrfs_next_leaf() we end up at btrfs_next_old_leaf() with a time_seq of 0. This triggers a failure of the first assertion at btrfs_next_old_leaf(), since we have a nowait path. With assertions disabled, we simply don't respect the NOWAIT semantics, allowing the write to block on locks or on IO for reading an extent buffer from disk. Fix this by: 1) Triggering the assertion only if time_seq is not 0, which means that the search is being done by a tree mod log user, and in the buffered and direct IO write paths we don't use the tree mod log; 2) Implementing NOWAIT semantics at btrfs_next_old_leaf(): any failure to lock an extent buffer should return immediately rather than retry the search, and the same applies if we need to do IO to read an extent buffer from disk.
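The NOWAIT convention applied here is the usual trylock-or--EAGAIN pattern; a hypothetical user-space sketch:

  #include <errno.h>
  #include <pthread.h>

  static int tree_read_lock(pthread_rwlock_t *lock, int nowait)
  {
          if (nowait) {
                  /* never block: report -EAGAIN so the caller can retry
                   * from a context that is allowed to sleep */
                  if (pthread_rwlock_tryrdlock(lock) != 0)
                          return -EAGAIN;
                  return 0;
          }
          pthread_rwlock_rdlock(lock);
          return 0;
  }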
Fixes: c922b016f353 ("btrfs: assert nowait mode is not used for some btree search functions") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a9543f01184c..dcb510f38dda 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4663,7 +4663,12 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int ret; int i; - ASSERT(!path->nowait); + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) @@ -4683,7 +4688,14 @@ again: if (path->need_commit_sem) { path->need_commit_sem = 0; need_commit_sem = true; - down_read(&fs_info->commit_root_sem); + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } @@ -4759,7 +4771,7 @@ again: next = c; ret = read_block_for_search(root, path, &next, level, slot, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4769,6 +4781,10 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } if (!ret && time_seq) { /* * If we don't get the lock, we may be racing @@ -4799,7 +4815,7 @@ again: ret = read_block_for_search(root, path, &next, level, 0, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4807,8 +4823,16 @@ again: goto done; } - if (!path->skip_locking) - btrfs_tree_read_lock(next); + if (!path->skip_locking) { + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(next); + } + } } ret = 0; done: -- cgit From b740d806166979488e798e41743aaec051f2443f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Nov 2022 11:44:51 -0500 Subject: btrfs: free btrfs_path before copying root refs to userspace Syzbot reported the following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc7-syzkaller-18095-gbbed346d5a96 #0 Not tainted ------------------------------------------------------ syz-executor307/3029 is trying to acquire lock: ffff0000c02525d8 (&mm->mmap_lock){++++}-{3:3}, at: __might_fault+0x54/0xb4 mm/memory.c:5576 but task is already holding lock: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (btrfs-root-00){++++}-{3:3}: down_read_nested+0x64/0x84 kernel/locking/rwsem.c:1624 __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 btrfs_search_slot_get_root+0x74/0x338 fs/btrfs/ctree.c:1637 btrfs_search_slot+0x1b0/0xfd8 fs/btrfs/ctree.c:1944 btrfs_update_root+0x6c/0x5a0 fs/btrfs/root-tree.c:132 commit_fs_roots+0x1f0/0x33c fs/btrfs/transaction.c:1459 btrfs_commit_transaction+0x89c/0x12d8 fs/btrfs/transaction.c:2343 flush_space+0x66c/0x738 fs/btrfs/space-info.c:786 btrfs_async_reclaim_metadata_space+0x43c/0x4e0 fs/btrfs/space-info.c:1059 process_one_work+0x2d8/0x504 kernel/workqueue.c:2289 worker_thread+0x340/0x610 kernel/workqueue.c:2436 kthread+0x12c/0x158 kernel/kthread.c:376 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:860 -> #2 (&fs_info->reloc_mutex){+.+.}-{3:3}: __mutex_lock_common+0xd4/0xca8 kernel/locking/mutex.c:603 __mutex_lock kernel/locking/mutex.c:747 [inline] mutex_lock_nested+0x38/0x44 kernel/locking/mutex.c:799 btrfs_record_root_in_trans fs/btrfs/transaction.c:516 [inline] start_transaction+0x248/0x944 fs/btrfs/transaction.c:752 btrfs_start_transaction+0x34/0x44 fs/btrfs/transaction.c:781 btrfs_create_common+0xf0/0x1b4 fs/btrfs/inode.c:6651 btrfs_create+0x8c/0xb0 fs/btrfs/inode.c:6697 lookup_open fs/namei.c:3413 [inline] open_last_lookups fs/namei.c:3481 [inline] path_openat+0x804/0x11c4 fs/namei.c:3688 do_filp_open+0xdc/0x1b8 fs/namei.c:3718 do_sys_openat2+0xb8/0x22c fs/open.c:1313 do_sys_open fs/open.c:1329 [inline] __do_sys_openat fs/open.c:1345 [inline] __se_sys_openat fs/open.c:1340 [inline] __arm64_sys_openat+0xb0/0xe0 fs/open.c:1340 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 -> #1 (sb_internal#2){.+.+}-{0:0}: percpu_down_read include/linux/percpu-rwsem.h:51 [inline] __sb_start_write include/linux/fs.h:1826 [inline] sb_start_intwrite include/linux/fs.h:1948 [inline] start_transaction+0x360/0x944 fs/btrfs/transaction.c:683 btrfs_join_transaction+0x30/0x40 fs/btrfs/transaction.c:795 btrfs_dirty_inode+0x50/0x140 fs/btrfs/inode.c:6103 btrfs_update_time+0x1c0/0x1e8 fs/btrfs/inode.c:6145 inode_update_time fs/inode.c:1872 [inline] touch_atime+0x1f0/0x4a8 fs/inode.c:1945 file_accessed include/linux/fs.h:2516 [inline] btrfs_file_mmap+0x50/0x88 fs/btrfs/file.c:2407 call_mmap include/linux/fs.h:2192 [inline] mmap_region+0x7fc/0xc14 mm/mmap.c:1752 do_mmap+0x644/0x97c mm/mmap.c:1540 vm_mmap_pgoff+0xe8/0x1d0 mm/util.c:552 ksys_mmap_pgoff+0x1cc/0x278 mm/mmap.c:1586 __do_sys_mmap arch/arm64/kernel/sys.c:28 [inline] __se_sys_mmap arch/arm64/kernel/sys.c:21 [inline] __arm64_sys_mmap+0x58/0x6c arch/arm64/kernel/sys.c:21 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 -> #0 
(&mm->mmap_lock){++++}-{3:3}: check_prev_add kernel/locking/lockdep.c:3095 [inline] check_prevs_add kernel/locking/lockdep.c:3214 [inline] validate_chain kernel/locking/lockdep.c:3829 [inline] __lock_acquire+0x1530/0x30a4 kernel/locking/lockdep.c:5053 lock_acquire+0x100/0x1f8 kernel/locking/lockdep.c:5666 __might_fault+0x7c/0xb4 mm/memory.c:5577 _copy_to_user include/linux/uaccess.h:134 [inline] copy_to_user include/linux/uaccess.h:160 [inline] btrfs_ioctl_get_subvol_rootref+0x3a8/0x4bc fs/btrfs/ioctl.c:3203 btrfs_ioctl+0xa08/0xa64 fs/btrfs/ioctl.c:5556 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __arm64_sys_ioctl+0xd0/0x140 fs/ioctl.c:856 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 other info that might help us debug this: Chain exists of: &mm->mmap_lock --> &fs_info->reloc_mutex --> btrfs-root-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-root-00); lock(&fs_info->reloc_mutex); lock(btrfs-root-00); lock(&mm->mmap_lock); *** DEADLOCK *** 1 lock held by syz-executor307/3029: #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 stack backtrace: CPU: 0 PID: 3029 Comm: syz-executor307 Not tainted 6.0.0-rc7-syzkaller-18095-gbbed346d5a96 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/30/2022 Call trace: dump_backtrace+0x1c4/0x1f0 arch/arm64/kernel/stacktrace.c:156 show_stack+0x2c/0x54 arch/arm64/kernel/stacktrace.c:163 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x104/0x16c lib/dump_stack.c:106 dump_stack+0x1c/0x58 lib/dump_stack.c:113 print_circular_bug+0x2c4/0x2c8 kernel/locking/lockdep.c:2053 check_noncircular+0x14c/0x154 kernel/locking/lockdep.c:2175 check_prev_add kernel/locking/lockdep.c:3095 [inline] check_prevs_add kernel/locking/lockdep.c:3214 [inline] validate_chain kernel/locking/lockdep.c:3829 [inline] __lock_acquire+0x1530/0x30a4 kernel/locking/lockdep.c:5053 lock_acquire+0x100/0x1f8 kernel/locking/lockdep.c:5666 __might_fault+0x7c/0xb4 mm/memory.c:5577 _copy_to_user include/linux/uaccess.h:134 [inline] copy_to_user include/linux/uaccess.h:160 [inline] btrfs_ioctl_get_subvol_rootref+0x3a8/0x4bc fs/btrfs/ioctl.c:3203 btrfs_ioctl+0xa08/0xa64 fs/btrfs/ioctl.c:5556 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __arm64_sys_ioctl+0xd0/0x140 fs/ioctl.c:856 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 We do generally the right thing here, copying the references into a temporary buffer, however we are still 
holding the path when we do copy_to_user from the temporary buffer. Fix this by freeing the path before we copy to user space. Reported-by: syzbot+4ef9e52e464c6ff47d9d@syzkaller.appspotmail.com CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d5dd8bed1488..89b8d14cb68c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3194,6 +3194,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, } out: + btrfs_free_path(path); + if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ @@ -3205,7 +3207,6 @@ out: } kfree(rootrefs); - btrfs_free_path(path); return ret; } -- cgit From 418ffb9e3cf6c4e2574d3a732b724916684bd133 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:28 +0530 Subject: btrfs: free btrfs_path before copying inodes to userspace btrfs_ioctl_logical_to_ino() frees the search path after the userspace copy from the temp buffer @inodes, which can potentially lead to a lock splat. Fix this by freeing the path before we copy @inodes to userspace. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 89b8d14cb68c..b595f2c6dfc9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4282,21 +4282,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, size = min_t(u32, loi->size, SZ_16M); } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); - inodes = NULL; - goto out; + goto out_loi; } + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, inodes, ignore_offset); + btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4308,7 +4307,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, ret = -EFAULT; out: - btrfs_free_path(path); kvfree(inodes); out_loi: kfree(loi); -- cgit From 8cf96b409d9b3946ece58ced13f92d0f775b0442 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:29 +0530 Subject: btrfs: free btrfs_path before copying fspath to userspace btrfs_ioctl_ino_to_path() frees the search path after the userspace copy from the temp buffer @ipath->fspath, which can potentially lead to a lock splat warning. Fix this by freeing the path before we copy it to userspace.
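All three of these btrfs fixes apply the same ordering rule: release the btrfs_path, and with it the extent buffer locks it pins, before any copy_to_user() that can fault in user pages and take mmap_lock. Below is a minimal sketch of the pattern, not taken from any of the patches themselves; example_ioctl() and example_search() are hypothetical stand-ins for the ioctl-specific details:

	#include <linux/slab.h>
	#include <linux/uaccess.h>
	#include "ctree.h"

	/* Hypothetical helper: walks the tree with @path and fills @tmp. */
	static int example_search(struct btrfs_root *root,
				  struct btrfs_path *path, char *tmp)
	{
		return 0;
	}

	static long example_ioctl(struct btrfs_root *root, void __user *ubuf)
	{
		struct btrfs_path *path;
		char *tmp;
		long ret;

		path = btrfs_alloc_path();
		tmp = kzalloc(PAGE_SIZE, GFP_KERNEL);
		if (!path || !tmp) {
			ret = -ENOMEM;
			goto out;
		}

		/* Fill the temporary buffer while the tree locks are held. */
		ret = example_search(root, path, tmp);

		/*
		 * Drop the path, and hence the tree locks, before copying
		 * out; the copy may fault and take mmap_lock.
		 */
		btrfs_free_path(path);
		path = NULL;

		if (!ret && copy_to_user(ubuf, tmp, PAGE_SIZE))
			ret = -EFAULT;
	out:
		btrfs_free_path(path);	/* tolerates NULL */
		kfree(tmp);
		return ret;
	}

Nulling the path after the early free keeps the shared cleanup label idempotent, which is the same trick the ino_to_path and get_subvol_info hunks use below.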
CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b595f2c6dfc9..df5b893494fa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4232,6 +4232,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } + btrfs_free_path(path); + path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { -- cgit From 013c1c5585ebcfb19c88efe79063d0463b1b6159 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:31 +0530 Subject: btrfs: free btrfs_path before copying subvol info to userspace btrfs_ioctl_get_subvol_info() frees the search path after the userspace copy from the temp buffer @subvol_info. This can lead to a lock splat warning. Fix this by freeing the path before we copy it to userspace. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index df5b893494fa..5ba2e810dc6e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3105,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) } } + btrfs_free_path(path); + path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; -- cgit From 02aef4225258fa6d022ce9040716aeecc3afc521 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 10 Nov 2022 00:29:42 -0800 Subject: fscrypt: pass super_block to fscrypt_put_master_key_activeref() As this code confused Linus [1], pass the super_block as an argument to fscrypt_put_master_key_activeref(). This removes the need to have the back-pointer ->mk_sb, so remove that. [1] https://lore.kernel.org/linux-fscrypt/CAHk-=wgud4Bc_um+htgfagYpZAnOoCb3NUoW67hc9LhOKsMtJg@mail.gmail.com Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221110082942.351615-1-ebiggers@kernel.org --- fs/crypto/fscrypt_private.h | 13 ++++--------- fs/crypto/keyring.c | 14 ++++++-------- fs/crypto/keysetup.c | 2 +- 3 files changed, 11 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d5f68a0c5d15..316a778cec0f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -439,13 +439,7 @@ struct fscrypt_master_key_secret { struct fscrypt_master_key { /* - * Back-pointer to the super_block of the filesystem to which this - * master key has been added. Only valid if ->mk_active_refs > 0. - */ - struct super_block *mk_sb; - - /* - * Link in ->mk_sb->s_master_keys->key_hashtable. + * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; @@ -456,7 +450,7 @@ struct fscrypt_master_key { /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring - * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g. + * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. 
* @@ -569,7 +563,8 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) void fscrypt_put_master_key(struct fscrypt_master_key *mk); -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk); +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 2a24b1f0ae68..78dd2ff306bd 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -79,10 +79,9 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk) call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk) { - struct super_block *sb = mk->mk_sb; - struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) @@ -93,9 +92,9 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) * destroying any subkeys embedded in it. */ - spin_lock(&keyring->lock); + spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); - spin_unlock(&keyring->lock); + spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_secret is not present and @@ -243,7 +242,7 @@ void fscrypt_destroy_keyring(struct super_block *sb) WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); } } kfree_sensitive(keyring); @@ -424,7 +423,6 @@ static int add_new_master_key(struct super_block *sb, if (!mk) return -ENOMEM; - mk->mk_sb = sb; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; @@ -1068,7 +1066,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index f7407071a952..9e44dc078a81 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -509,7 +509,7 @@ static void put_crypt_info(struct fscrypt_info *ci) spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); -- cgit From 9a48b4a6fd512bdaed7e38ba844be743163d49c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 6 Nov 2022 17:03:13 -0800 Subject: xfs: fully initialize xfs_da_args in xchk_directory_blocks While running the online fsck test suite, I noticed the following assertion in the kernel log (edited for brevity): XFS: Assertion failed: 0, file: fs/xfs/xfs_health.c, line: 571 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 11667 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] CPU: 3 PID: 11667 Comm: xfs_scrub Tainted: G W 5.19.0-rc7-xfsx #rc7 6e6475eb29fd9dda3181f81b7ca7ff961d277a40 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:assfail+0x46/0x4a [xfs] Call Trace: xfs_dir2_isblock+0xcc/0xe0 xchk_directory_blocks+0xc7/0x420 xchk_directory+0x53/0xb0 xfs_scrub_metadata+0x2b6/0x6b0 xfs_scrubv_metadata+0x35e/0x4d0 xfs_ioc_scrubv_metadata+0x111/0x160 xfs_file_ioctl+0x4ec/0xef0 __x64_sys_ioctl+0x82/0xa0 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This assertion triggers in xfs_dirattr_mark_sick when the caller passes in a whichfork value that is neither of XFS_{DATA,ATTR}_FORK. The cause of this is that xchk_directory_blocks only partially initializes the xfs_da_args structure that is passed to xfs_dir2_isblock. If the data fork is not correct, the XFS_IS_CORRUPT clause will trigger. My development branch reports this failure to the health monitoring subsystem, which accesses the uninitialized args->whichfork field, leading the the assertion tripping. We really shouldn't be passing random stack contents around, so the solution here is to force the compiler to zero-initialize the struct. Found by fuzzing u3.bmx[0].blockcount = middlebit on xfs/1554. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/dir.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5c87800ab223..d1b0f23c2c59 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -666,7 +666,12 @@ xchk_directory_blocks( struct xfs_scrub *sc) { struct xfs_bmbt_irec got; - struct xfs_da_args args; + struct xfs_da_args args = { + .dp = sc ->ip, + .whichfork = XFS_DATA_FORK, + .geo = sc->mp->m_dir_geo, + .trans = sc->tp, + }; struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; @@ -689,9 +694,6 @@ xchk_directory_blocks( free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); /* Is this a block dir? */ - args.dp = sc->ip; - args.geo = mp->m_dir_geo; - args.trans = sc->tp; error = xfs_dir2_isblock(&args, &is_block); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) goto out; -- cgit From be1317fdb8d4e3ccbac43e199b360c248c600d99 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: xfs: don't track the AGFL buffer in the scrub AG context While scrubbing an allocation group, we don't need to hold the AGFL buffer as part of the scrub context. All that is necessary to lock an AG is to hold the AGI and AGF buffers, so fix all the existing users of the AGFL buffer to grab them only when necessary. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 47 ++++++++++++++++++++++++++---------------- fs/xfs/scrub/agheader_repair.c | 1 - fs/xfs/scrub/common.c | 8 ------- fs/xfs/scrub/repair.c | 11 +++++----- fs/xfs/scrub/scrub.h | 1 - 5 files changed, 35 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index b7b838bd4ba4..af284baa6f4c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -609,9 +609,16 @@ out: /* AGFL */ struct xchk_agfl_info { - unsigned int sz_entries; + /* Number of AGFL entries that the AGF claims are in use. */ + unsigned int agflcount; + + /* Number of AGFL entries that we found. */ unsigned int nr_entries; + + /* Buffer to hold AGFL entries for extent checking. */ xfs_agblock_t *entries; + + struct xfs_buf *agfl_bp; struct xfs_scrub *sc; }; @@ -641,10 +648,10 @@ xchk_agfl_block( struct xfs_scrub *sc = sai->sc; if (xfs_verify_agbno(sc->sa.pag, agbno) && - sai->nr_entries < sai->sz_entries) + sai->nr_entries < sai->agflcount) sai->entries[sai->nr_entries++] = agbno; else - xchk_block_set_corrupt(sc, sc->sa.agfl_bp); + xchk_block_set_corrupt(sc, sai->agfl_bp); xchk_agfl_block_xref(sc, agbno); @@ -696,19 +703,26 @@ int xchk_agfl( struct xfs_scrub *sc) { - struct xchk_agfl_info sai; + struct xchk_agfl_info sai = { + .sc = sc, + }; struct xfs_agf *agf; xfs_agnumber_t agno = sc->sm->sm_agno; - unsigned int agflcount; unsigned int i; int error; + /* Lock the AGF and AGI so that nobody can touch this AG. */ error = xchk_ag_read_headers(sc, agno, &sc->sa); if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) - goto out; + return error; if (!sc->sa.agf_bp) return -EFSCORRUPTED; - xchk_buffer_recheck(sc, sc->sa.agfl_bp); + + /* Try to read the AGFL, and verify its structure if we get it. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp); + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) + return error; + xchk_buffer_recheck(sc, sai.agfl_bp); xchk_agfl_xref(sc); @@ -717,24 +731,21 @@ xchk_agfl( /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = sc->sa.agf_bp->b_addr; - agflcount = be32_to_cpu(agf->agf_flcount); - if (agflcount > xfs_agfl_size(sc->mp)) { + sai.agflcount = be32_to_cpu(agf->agf_flcount); + if (sai.agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } - memset(&sai, 0, sizeof(sai)); - sai.sc = sc; - sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, - KM_MAYFAIL); + sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!sai.entries) { error = -ENOMEM; goto out; } /* Check the blocks in the AGFL. 
*/ - error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, - sc->sa.agfl_bp, xchk_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp, + xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; goto out_free; @@ -742,7 +753,7 @@ xchk_agfl( if (error) goto out_free; - if (agflcount != sai.nr_entries) { + if (sai.agflcount != sai.nr_entries) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); goto out_free; } @@ -758,7 +769,7 @@ xchk_agfl( } out_free: - kmem_free(sai.entries); + kvfree(sai.entries); out: return error; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 1b0b4e243f77..2e75ff9b5b2e 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -697,7 +697,6 @@ xrep_agfl( * freespace overflow to the freespace btrees. */ sc->sa.agf_bp = agf_bp; - sc->sa.agfl_bp = agfl_bp; error = xrep_roll_ag_trans(sc); if (error) goto err; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 9bbbf20f401b..ad70f29233c3 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -424,10 +424,6 @@ xchk_ag_read_headers( if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); - if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) - return error; - return 0; } @@ -515,10 +511,6 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); - if (sa->agfl_bp) { - xfs_trans_brelse(sc->tp, sa->agfl_bp); - sa->agfl_bp = NULL; - } if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c18bd039fce9..2ada7fc1c398 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -126,8 +126,6 @@ xrep_roll_ag_trans( xfs_trans_bhold(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bhold(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); /* * Roll the transaction. We still own the buffer and the buffer lock @@ -145,8 +143,6 @@ xrep_roll_ag_trans( xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; } @@ -498,6 +494,7 @@ xrep_put_freelist( struct xfs_scrub *sc, xfs_agblock_t agbno) { + struct xfs_buf *agfl_bp; int error; /* Make sure there's space on the freelist. */ @@ -516,8 +513,12 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - sc->sa.agfl_bp, agbno, 0); + agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 3de5287e98d8..151567f88366 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -39,7 +39,6 @@ struct xchk_ag { /* AG btree roots */ struct xfs_buf *agf_bp; - struct xfs_buf *agfl_bp; struct xfs_buf *agi_bp; /* AG btrees */ -- cgit From 3e59c0103e66d6e687a8b47fd70169542aba938e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: xfs: log the AGI/AGF buffers when rolling transactions during an AG repair Currently, the only way to lock an allocation group is to hold the AGI and AGF buffers. 
If a repair needs to roll the transaction while repairing some AG metadata, it maintains that lock by holding the two buffers across the transaction roll and joins them afterwards. However, repair is not like other parts of XFS that employ the bhold - roll - bjoin sequence because it's possible that the AGI or AGF buffers are not actually dirty before the roll. This presents two problems -- First, we need to redirty those buffers to keep them moving along in the log to avoid pinning the log tail. Second, a clean buffer log item can detach from the buffer. If this happens, the buffer type state is discarded along with the bli and must be reattached before the next time the buffer is logged. If it is not, the logging code will complain and log recovery will not work properly. An earlier version of this patch tried to fix the second problem by re-setting the buffer type in the bli after joining the buffer to the new transaction, but that looked weird and didn't solve the first problem. Instead, solve both problems by logging the buffer before rolling the transaction. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 2ada7fc1c398..22335619c84e 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -121,24 +121,36 @@ xrep_roll_ag_trans( { int error; - /* Keep the AG header buffers locked so we can keep going. */ - if (sc->sa.agi_bp) + /* + * Keep the AG header buffers locked while we roll the transaction. + * Ensure that both AG buffers are dirty and held when we roll the + * transaction so that they move forward in the log without losing the + * bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } /* - * Roll the transaction. We still own the buffer and the buffer lock - * regardless of whether or not the roll succeeds. If the roll fails, - * the buffers will be released during teardown on our way out of the - * kernel. If it succeeds, we join them to the new transaction and - * move on. + * Roll the transaction. We still hold the AG header buffers locked + * regardless of whether or not that succeeds. On failure, the buffers + * will be released during teardown on our way out of the kernel. If + * successful, join the buffers to the new transaction and move on. */ error = xfs_trans_roll(&sc->tp); if (error) return error; - /* Join AG headers to the new transaction. */ + /* Join the AG headers to the new transaction. */ if (sc->sa.agi_bp) xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); if (sc->sa.agf_bp) -- cgit From b255fab0f80cc65a334fcd90cd278673cddbc988 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:14 -0800 Subject: xfs: make AGFL repair function avoid crosslinked blocks Teach the AGFL repair function to check each block of the proposed AGFL against the rmap btree. If the rmapbt finds any mappings that are not OWN_AG, strike that block from the list. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader_repair.c | 78 +++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 2e75ff9b5b2e..82ceb60ea5fc 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -442,12 +442,18 @@ out_revert: /* AGFL */ struct xrep_agfl { + /* Bitmap of alleged AGFL blocks that we're not going to add. */ + struct xbitmap crossed; + /* Bitmap of other OWN_AG metadata blocks. */ struct xbitmap agmetablocks; /* Bitmap of free space. */ struct xbitmap *freesp; + /* rmapbt cursor for finding crosslinked blocks */ + struct xfs_btree_cur *rmap_cur; + struct xfs_scrub *sc; }; @@ -477,6 +483,41 @@ xrep_agfl_walk_rmap( return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } +/* Strike out the blocks that are cross-linked according to the rmapbt. */ +STATIC int +xrep_agfl_check_extent( + struct xrep_agfl *ra, + uint64_t start, + uint64_t len) +{ + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t last_agbno = agbno + len - 1; + int error; + + ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); + + while (agbno <= last_agbno) { + bool other_owners; + + error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1, + &XFS_RMAP_OINFO_AG, &other_owners); + if (error) + return error; + + if (other_owners) { + error = xbitmap_set(&ra->crossed, agbno, 1); + if (error) + return error; + } + + if (xchk_should_terminate(ra->sc, &error)) + return error; + agbno++; + } + + return 0; +} + /* * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce * which blocks belong to the AGFL. @@ -496,44 +537,58 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; + struct xbitmap_range *br, *n; int error; ra.sc = sc; ra.freesp = agfl_extents; xbitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); - if (error) - goto err; - xfs_btree_del_cursor(cur, error); + if (error) + goto out_bmp; /* * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); - xbitmap_destroy(&ra.agmetablocks); if (error) - return error; + goto out_bmp; + + /* Strike out the blocks that are cross-linked. */ + ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); + for_each_xbitmap_extent(br, n, agfl_extents) { + error = xrep_agfl_check_extent(&ra, br->start, br->len); + if (error) + break; + } + xfs_btree_del_cursor(ra.rmap_cur, error); + if (error) + goto out_bmp; + error = xbitmap_disunion(agfl_extents, &ra.crossed); + if (error) + goto out_bmp; /* * Calculate the new AGFL size. 
If we found more blocks than fit in @@ -541,11 +596,10 @@ xrep_agfl_collect_blocks( */ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), xfs_agfl_size(mp)); - return 0; -err: +out_bmp: + xbitmap_destroy(&ra.crossed); xbitmap_destroy(&ra.agmetablocks); - xfs_btree_del_cursor(cur, error); return error; } -- cgit From 48ff40458f871fb19e7b1b40e0e5084b8751d9cb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:15 -0800 Subject: xfs: standardize GFP flags usage in online scrub Memory allocation usage is the same throughout online fsck -- we want kernel memory, we have to be able to back out if we can't allocate memory, and we don't want to spray dmesg with memory allocation failure reports. Standardize the GFP flag usage and document these requirements. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 2 +- fs/xfs/scrub/attr.c | 9 ++++----- fs/xfs/scrub/scrub.h | 9 +++++++++ fs/xfs/scrub/symlink.c | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index af284baa6f4c..4dd52b15f09c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -737,7 +737,7 @@ xchk_agfl( goto out; } sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + XCHK_GFP_FLAGS); if (!sai.entries) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index b6f0c9f3f124..11b2593a2be7 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -79,7 +79,8 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, + XCHK_GFP_FLAGS); if (error) return error; } @@ -138,8 +139,7 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -324,8 +324,7 @@ xchk_xattr_block( return 0; /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); if (error == -ENOMEM) return -EDEADLOCK; if (error) diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 151567f88366..a0f097b8acb0 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -8,6 +8,15 @@ struct xfs_scrub; +/* + * Standard flags for allocating memory within scrub. NOFS context is + * configured by the process allocation scope. Scrub and repair must be able + * to back out gracefully if there isn't enough memory. Force-cast to avoid + * complaints from static checkers. + */ +#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ + __GFP_RETRY_MAYFAIL)) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 75311f8daeeb..c1c99ffe7408 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -21,7 +21,7 @@ xchk_setup_symlink( struct xfs_scrub *sc) { /* Allocate the buffer without the inode lock held. 
*/ - sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; -- cgit From fcd2a43488d5a211aec94e28369b2a72c28258a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:15 -0800 Subject: xfs: initialize the check_owner object fully Initialize the check_owner list head so that we don't corrupt the list. Reduce the scope of the object pointer. Fixes: 858333dcf021 ("xfs: check btree block ownership with bnobt/rmapbt when scrubbing btree") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/btree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 2f4519590dc1..075ff3071122 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -408,7 +408,6 @@ xchk_btree_check_owner( struct xfs_buf *bp) { struct xfs_btree_cur *cur = bs->cur; - struct check_owner *co; /* * In theory, xfs_btree_get_block should only give us a null buffer @@ -431,10 +430,14 @@ xchk_btree_check_owner( * later scanning. */ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + struct check_owner *co; + co = kmem_alloc(sizeof(struct check_owner), KM_MAYFAIL); if (!co) return -ENOMEM; + + INIT_LIST_HEAD(&co->list); co->level = level; co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); -- cgit From 306195f355bbdcc3eff6cffac05bcd93a5e419ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:16 -0800 Subject: xfs: pivot online scrub away from kmem.[ch] Convert all the online scrub code to use the Linux slab allocator functions directly instead of going through the kmem wrappers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/scrub/attr.c | 2 +- fs/xfs/scrub/bitmap.c | 11 ++++++----- fs/xfs/scrub/btree.c | 9 ++++----- fs/xfs/scrub/dabtree.c | 4 ++-- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/refcount.c | 12 ++++++------ fs/xfs/scrub/scrub.c | 6 +++--- 8 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 82ceb60ea5fc..d75d82151eeb 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -685,7 +685,7 @@ xrep_agfl_init_header( if (br->len) break; list_del(&br->list); - kmem_free(br); + kfree(br); } /* Write new AGFL to disk. */ diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 11b2593a2be7..31529b9bf389 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -49,7 +49,7 @@ xchk_setup_xattr_buf( if (ab) { if (sz <= ab->sz) return 0; - kmem_free(ab); + kvfree(ab); sc->buf = NULL; } diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index b89bf9de9b1c..a255f09e9f0a 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -10,6 +10,7 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "scrub/scrub.h" #include "scrub/bitmap.h" /* @@ -25,7 +26,7 @@ xbitmap_set( { struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); + bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); if (!bmr) return -ENOMEM; @@ -47,7 +48,7 @@ xbitmap_destroy( for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); - kmem_free(bmr); + kfree(bmr); } } @@ -174,15 +175,15 @@ xbitmap_disunion( /* Total overlap, just delete ex. 
*/ lp = lp->next; list_del(&br->list); - kmem_free(br); + kfree(br); break; case 0: /* * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xbitmap_range), - KM_MAYFAIL); + new_br = kmalloc(sizeof(struct xbitmap_range), + XCHK_GFP_FLAGS); if (!new_br) { error = -ENOMEM; goto out; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 075ff3071122..0fd36d5b4646 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -432,8 +432,7 @@ xchk_btree_check_owner( if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { struct check_owner *co; - co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL); + co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); if (!co) return -ENOMEM; @@ -652,7 +651,7 @@ xchk_btree( xchk_btree_set_corrupt(sc, cur, 0); return 0; } - bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL); + bs = kzalloc(cur_sz, XCHK_GFP_FLAGS); if (!bs) return -ENOMEM; bs->cur = cur; @@ -743,9 +742,9 @@ out: error = xchk_btree_check_block_owner(bs, co->level, co->daddr); list_del(&co->list); - kmem_free(co); + kfree(co); } - kmem_free(bs); + kfree(bs); return error; } diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 84fe3d33d699..d17cee177085 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -486,7 +486,7 @@ xchk_da_btree( return 0; /* Set up initial da state. */ - ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL); + ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS); if (!ds) return -ENOMEM; ds->dargs.dp = sc->ip; @@ -591,6 +591,6 @@ out: out_state: xfs_da_state_free(ds->state); - kmem_free(ds); + kfree(ds); return error; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 6a6f8fe7f87c..3c56f5890da4 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -116,7 +116,7 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; fsc = sc->buf; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index a26ee0f24ef2..d9c1b3cea4a5 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check( * is healthy each rmap_irec we see will be in agbno order * so we don't need insertion sort here. */ - frag = kmem_alloc(sizeof(struct xchk_refcnt_frag), - KM_MAYFAIL); + frag = kmalloc(sizeof(struct xchk_refcnt_frag), + XCHK_GFP_FLAGS); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments( continue; } list_del(&frag->list); - kmem_free(frag); + kfree(frag); nr++; } @@ -257,11 +257,11 @@ done: /* Delete fragments and work list. 
*/ list_for_each_entry_safe(frag, n, &worklist, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } list_for_each_entry_safe(frag, n, &refchk->fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } @@ -306,7 +306,7 @@ xchk_refcountbt_xref_rmap( out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { list_del(&frag->list); - kmem_free(frag); + kfree(frag); } } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 2e8e400f10a9..07a7a75f987f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -174,7 +174,7 @@ xchk_teardown( if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->buf) { - kmem_free(sc->buf); + kvfree(sc->buf); sc->buf = NULL; } return error; @@ -467,7 +467,7 @@ xfs_scrub_metadata( xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); - sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); + sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; goto out; @@ -557,7 +557,7 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: - kmem_free(sc); + kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { -- cgit From a7a0f9a5503f4da3b6489583ce4ef9abc0ab2475 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:16 -0800 Subject: xfs: return EINTR when a fatal signal terminates scrub If the program calling online fsck is terminated with a fatal signal, bail out to userspace by returning EINTR, not EAGAIN. EAGAIN is used by scrubbers to indicate that we should try again with more resources locked, and not to indicate that the operation was cancelled. The miswiring is mostly harmless, but it shows up in the trace data. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 454145db10e7..b73648d81d23 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,7 +25,7 @@ xchk_should_terminate( if (fatal_signal_pending(current)) { if (*error == 0) - *error = -EAGAIN; + *error = -EINTR; return true; } return false; -- cgit From 0a713bd41ea2b19904232b9c5278012c4361bc04 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: xfs: fix return code when fatal signal encountered during dquot scrub If the scrub process is sent a fatal signal while we're checking dquots, the predicate for this will set the error code to -EINTR. Don't then squash that into -ECANCELED, because the wrong errno turns up in the trace output. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 21b4c9006859..0b643ff32b22 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -84,7 +84,7 @@ xchk_quota_item( int error = 0; if (xchk_should_terminate(sc, &error)) - return -ECANCELED; + return error; /* * Except for the root dquot, the actual dquot we got must either have -- cgit From 6bf2f87915970160ded16c310e2e8887deff97a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: xfs: don't retry repairs harder when EAGAIN is returned Repair functions will not return EAGAIN -- if they were not able to obtain resources, they should return EDEADLOCK (like the rest of online fsck) to signal that we need to grab all the resources and try again. Hence we don't need to deal with this case except as a debugging assertion. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 22335619c84e..7323bd9fddfb 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -61,7 +61,6 @@ xrep_attempt( sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: - case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; @@ -74,6 +73,11 @@ xrep_attempt( */ return -EFSCORRUPTED; default: + /* + * EAGAIN tells the caller to re-scrub, so we cannot return + * that here. + */ + ASSERT(error != -EAGAIN); return error; } } -- cgit From 93b0c58ed04b6cbe45354f23bb5628fff31f9084 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:17 -0800 Subject: xfs: don't return -EFSCORRUPTED from repair when resources cannot be grabbed If we tried to repair something but the repair failed with -EDEADLOCK, that means that the repair function couldn't grab some resource it needed and wants us to try again. If we try again (with TRY_HARDER) but still can't get all the resources we need, the repair fails and errors remain on the filesystem. Right now, repair returns the -EDEADLOCK to the caller as -EFSCORRUPTED, which results in XFS_SCRUB_OFLAG_CORRUPT being passed out to userspace. This is not correct because repair has not determined that anything is corrupt. If the repair had been invoked on an object that could be optimized but wasn't corrupt (OFLAG_PREEN), the inability to grab resources will be reported to userspace as corrupt metadata, and users will be unnecessarily alarmed that their suboptimal metadata turned into a corruption. Fix this by returning zero so that the results of the actual scrub will be copied back out to userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/repair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 7323bd9fddfb..4b92f9253ccd 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -69,9 +69,9 @@ xrep_attempt( /* * We tried harder but still couldn't grab all the resources * we needed to fix it. The corruption has not been fixed, - * so report back to userspace. + * so exit to userspace with the scan's output flags unchanged. */ - return -EFSCORRUPTED; + return 0; default: /* * EAGAIN tells the caller to re-scrub, so we cannot return -- cgit From 9e13975bb0620c2bfa1a4d2943e7eb8514f7708e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:18 -0800 Subject: xfs: load rtbitmap and rtsummary extent mapping btrees at mount time It turns out that GETFSMAP and online fsck have had a bug for years due to their use of ILOCK_SHARED to coordinate their linear scans of the realtime bitmap. 
If the bitmap file's data fork happens to be in BTREE format and the scan occurs immediately after mounting, the incore bmbt will not be populated, leading to ASSERTs tripping over the incorrect inode state. Because the bitmap scans always lock bitmap buffers in increasing order of file offset, it is appropriate for these two callers to take a shared ILOCK to improve scalability. To fix this problem, load both data and attr fork state into memory when mounting the realtime inodes. Realtime metadata files aren't supposed to have an attr fork so the second step is likely a nop. On most filesystems this is unlikely since the rtbitmap data fork is usually in extents format, but it's possible to craft a filesystem that will by fragmenting the free space in the data section and growfsing the rt section. Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") Also-Fixes: 46d9bfb5e706 ("xfs: cross-reference the realtime bitmap") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_rtalloc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 292d5e54a92c..b0846204c436 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1325,6 +1325,41 @@ xfs_rtalloc_reinit_frextents( return 0; } +/* + * Read in the bmbt of an rt metadata inode so that we never have to load them + * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use + * an empty transaction to avoid deadlocking on loops in the bmbt. + */ +static inline int +xfs_rtmount_iread_extents( + struct xfs_inode *ip, + unsigned int lock_class) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); + + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + if (xfs_inode_has_attr_fork(ip)) { + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + } + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); + xfs_trans_cancel(tp); + return error; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. @@ -1342,14 +1377,27 @@ xfs_rtmount_inodes( return error; ASSERT(mp->m_rbmip != NULL); + error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); + if (error) + goto out_rele_bitmap; + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); - if (error) { - xfs_irele(mp->m_rbmip); - return error; - } + if (error) + goto out_rele_bitmap; ASSERT(mp->m_rsumip != NULL); + + error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); + if (error) + goto out_rele_summary; + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; + +out_rele_summary: + xfs_irele(mp->m_rsumip); +out_rele_bitmap: + xfs_irele(mp->m_rbmip); + return error; } void -- cgit From 5f369dc5b4eb2becbdfd08924dcaf00e391f4ea1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:18 -0800 Subject: xfs: make rtbitmap ILOCKing consistent when scanning the rt bitmap file xfs_rtalloc_query_range scans the realtime bitmap file in order of increasing file offset, so this caller can take ILOCK_SHARED on the rt bitmap inode instead of ILOCK_EXCL. This isn't going to yield any practical benefits at mount time, but we'd like to make the locking usage consistent around xfs_rtalloc_query_all calls. 
Make all the places we do this use the same xfs_ilock lockflags for consistency. Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsmap.c | 4 ++-- fs/xfs/xfs_rtalloc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index d8337274c74d..88a88506ffff 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_mount *mp = tp->t_mountp; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); /* * Set up query parameters to return free rtextents covering the range @@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( if (error) goto err; err: - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b0846204c436..16534e9873f6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, &val); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); if (error) return error; -- cgit From 11f97e684583469fc342a561387cc44fac4f9b1f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: xfs: skip fscounters comparisons when the scan is incomplete If any part of the per-AG summary counter scan loop aborts without collecting all of the data we need, the scrubber's observation data will be invalid. Set the incomplete flag so that we abort the scrub without reporting false corruptions. Document the data dependency here too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/fscounters.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 3c56f5890da4..eeb36ac2c6d2 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -138,6 +138,18 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); } +/* + * Part 1: Collecting filesystem summary counts. For each AG, we add its + * summary counts (total inodes, free inodes, free data blocks) to an incore + * copy of the overall filesystem summary counts. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ static int xchk_fscount_btreeblks( @@ -225,8 +237,10 @@ retry: } if (pag) xfs_perag_put(pag); - if (error) + if (error) { + xchk_set_incomplete(sc); return error; + } /* * The global incore space reservation is taken from the incore @@ -267,6 +281,11 @@ retry: return 0; } +/* + * Part 2: Comparing filesystem summary counters. All we have to do here is + * sum the percpu counters and compare them to what we've observed. 
+ */ + /* * Is the @counter reasonably close to the @expected value? * -- cgit From e74331d6fa2c21a8ecccfe0648dad5193b83defe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: xfs: online checking of the free rt extent count Teach the summary count checker to count the number of free realtime extents and compare that to the superblock copy. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/fscounters.c | 86 +++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/scrub.h | 8 ----- 2 files changed, 84 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index eeb36ac2c6d2..4777e7b89fdc 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -14,6 +14,8 @@ #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -43,6 +45,16 @@ * our tolerance for mismatch between expected and actual counter values. */ +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; +}; + /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -120,6 +132,7 @@ xchk_setup_fscounters( if (!sc->buf) return -ENOMEM; fsc = sc->buf; + fsc->sc = sc; xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); @@ -281,6 +294,59 @@ retry: return 0; } +#ifdef CONFIG_XFS_RT +STATIC int +xchk_fscount_add_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xchk_fscounters *fsc = priv; + int error = 0; + + fsc->frextents += rec->ar_extcount; + + xchk_should_terminate(fsc->sc, &error); + return error; +} + +/* Calculate the number of free realtime extents from the realtime bitmap. */ +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + int error; + + fsc->frextents = 0; + if (!xfs_has_realtime(mp)) + return 0; + + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, + xchk_fscount_add_frextent, fsc); + if (error) { + xchk_set_incomplete(sc); + goto out_unlock; + } + +out_unlock: + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} +#else +STATIC int +xchk_fscount_count_frextents( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + fsc->frextents = 0; + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* * Part 2: Comparing filesystem summary counters. All we have to do here is * sum the percpu counters and compare them to what we've observed. @@ -352,16 +418,17 @@ xchk_fscounters( { struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; - int64_t icount, ifree, fdblocks; + int64_t icount, ifree, fdblocks, frextents; int error; /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); fdblocks = percpu_counter_sum(&mp->m_fdblocks); + frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0) + if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) xchk_set_corrupt(sc); /* See if icount is obviously wrong. 
*/ @@ -372,6 +439,10 @@ xchk_fscounters( if (fdblocks > mp->m_sb.sb_dblocks) xchk_set_corrupt(sc); + /* See if frextents is obviously wrong. */ + if (frextents > mp->m_sb.sb_rextents) + xchk_set_corrupt(sc); + /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. @@ -386,6 +457,13 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; + /* Count the free extents counter for rt volumes. */ + error = xchk_fscount_count_frextents(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + /* Compare the in-core counters with whatever we counted. */ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) xchk_set_corrupt(sc); @@ -397,5 +475,9 @@ xchk_fscounters( fsc->fdblocks)) xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + fsc->frextents)) + xchk_set_corrupt(sc); + return 0; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a0f097b8acb0..b4d391b4c938 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -169,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif -struct xchk_fscounters { - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - unsigned long long icount_min; - unsigned long long icount_max; -}; - #endif /* __XFS_SCRUB_SCRUB_H__ */ -- cgit From 033985b6fe875a7a971cf4e3941e1f3085ba037c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:19 -0800 Subject: xfs: fix perag loop in xchk_bmap_check_rmaps sparse complains that we can return an uninitialized error from this function and that pag could be uninitialized. We know that there are no zero-AG filesystems and hence we had to call xchk_bmap_check_ag_rmaps at least once, so this is not actually possible, but I'm too worn out from automated complaints from unsophisticated AIs so let's just fix this and move on to more interesting problems, eh? Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f0b9cb6506fd..cb203e083a4c 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -599,14 +599,14 @@ xchk_bmap_check_rmaps( for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); - if (error) - break; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - break; + if (error || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xfs_perag_put(pag); + return error; + } } - if (pag) - xfs_perag_put(pag); - return error; + + return 0; } /* -- cgit From 6a5777865eebee1b53d7ae0fd2fa9ec2c6596df6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:20 -0800 Subject: xfs: teach scrub to check for adjacent bmaps when rmap larger than bmap When scrub is checking file fork mappings against rmap records and the rmap record starts before or ends after the bmap record, check the adjacent bmap records to make sure that they're adjacent to the one we're checking. This helps us to detect cases where the rmaps cover territory that the bmaps do not. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index cb203e083a4c..a4a156dcaa8a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -90,6 +90,7 @@ out: struct xchk_bmap_info { struct xfs_scrub *sc; + struct xfs_iext_cursor icur; xfs_fileoff_t lastoff; bool is_rt; bool is_shared; @@ -146,6 +147,48 @@ xchk_bmap_get_rmap( return has_rmap; } +static inline bool +xchk_bmap_has_prev( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) + return false; + if (got.br_startoff + got.br_blockcount != irec->br_startoff) + return false; + if (got.br_startblock + got.br_blockcount != irec->br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + +static inline bool +xchk_bmap_has_next( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) + return false; + if (irec->br_startoff + irec->br_blockcount != got.br_startoff) + return false; + if (irec->br_startblock + irec->br_blockcount != got.br_startblock) + return false; + if (got.br_state != irec->br_state) + return false; + return true; +} + /* Make sure that we have rmapbt records for this extent. */ STATIC void xchk_bmap_xref_rmap( @@ -214,6 +257,34 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + + /* + * If the rmap starts before this bmbt record, make sure there's a bmbt + * record for the previous offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && + !xchk_bmap_has_prev(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } + + /* + * If the rmap ends after this bmbt record, make sure there's a bmbt + * record for the next offset that is contiguous with this mapping. + * Skip this for CoW fork extents because the refcount btree (and not + * the inode) is the ondisk owner for those extents. + */ + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (info->whichfork != XFS_COW_FORK && + rmap_end > agbno + irec->br_blockcount && + !xchk_bmap_has_next(info, irec)) { + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return; + } } /* Cross-reference a single rtdev extent record. */ @@ -626,7 +697,6 @@ xchk_bmap( struct xfs_inode *ip = sc->ip; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; - struct xfs_iext_cursor icur; int error = 0; /* Non-existent forks can be ignored. */ @@ -690,7 +760,7 @@ xchk_bmap( /* Scrub extent records. 
*/ info.lastoff = 0; ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &icur, &irec) { + for_each_xfs_iext(ifp, &info.icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; -- cgit From 830ffa09fb130d31f111848a75b65506fdac623d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:20 -0800 Subject: xfs: block map scrub should handle incore delalloc reservations Enhance the block map scrubber to check delayed allocation reservations. Though there are no physical space allocations to check, we do need to make sure that the range of file offsets being mapped are correct, and to bump the lastoff cursor so that key order checking works correctly. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 55 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index a4a156dcaa8a..fa8f22ed057a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -368,14 +368,13 @@ xchk_bmap_dirattr_extent( } /* Scrub a single extent record. */ -STATIC int +STATIC void xchk_bmap_iextent( struct xfs_inode *ip, struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = info->sc->mp; - int error = 0; /* * Check for out-of-order extents. This record could have come @@ -396,14 +395,6 @@ xchk_bmap_iextent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check for delalloc extents. We never iterate the ones in the - * in-core extent scan, and we should never see these in the bmbt. - */ - if (isnullstartblock(irec->br_startblock)) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -424,15 +415,12 @@ xchk_bmap_iextent( irec->br_startoff); if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return 0; + return; if (info->is_rt) xchk_bmap_rt_iextent_xref(ip, info, irec); else xchk_bmap_iextent_xref(ip, info, irec); - - info->lastoff = irec->br_startoff + irec->br_blockcount; - return error; } /* Scrub a bmbt record. */ @@ -680,6 +668,33 @@ xchk_bmap_check_rmaps( return 0; } +/* Scrub a delalloc reservation from the incore extent map tree. */ +STATIC void +xchk_bmap_iextent_delalloc( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = info->sc->mp; + + /* + * Check for out-of-order extents. This record could have come + * from the incore list, for which there is no ordering check. + */ + if (irec->br_startoff < info->lastoff) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Make sure the extent points to a valid place. */ + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); +} + /* * Scrub an inode fork's block mappings. 
* @@ -764,16 +779,18 @@ xchk_bmap( if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; - if (isnullstartblock(irec.br_startblock)) - continue; + if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); goto out; } - error = xchk_bmap_iextent(ip, &info, &irec); - if (error) - goto out; + + if (isnullstartblock(irec.br_startblock)) + xchk_bmap_iextent_delalloc(ip, &info, &irec); + else + xchk_bmap_iextent(ip, &info, &irec); + info.lastoff = irec.br_startoff + irec.br_blockcount; } error = xchk_bmap_check_rmaps(sc, whichfork); -- cgit From f23c40443d1c2af87c99c1424f9e43bbd7307f92 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:21 -0800 Subject: xfs: check quota files for unwritten extents Teach scrub to flag quota files containing unwritten extents. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/quota.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 0b643ff32b22..9eeac8565394 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -14,6 +14,7 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -189,11 +190,12 @@ xchk_quota_data_fork( for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; + /* - * delalloc extents or blocks mapped above the highest + * delalloc/unwritten extents or blocks mapped above the highest * quota id shouldn't happen. */ - if (isnullstartblock(irec.br_startblock) || + if (!xfs_bmap_is_written_extent(&irec) || irec.br_startoff > max_dqid_off || irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, -- cgit From 31785537010a91a0d1dc403e5d049a38a3d4a30b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:21 -0800 Subject: xfs: check that CoW fork extents are not shared Ensure that extents in an inode's CoW fork are not marked as shared in the refcount btree. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa8f22ed057a..4bb6672b02ba 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -335,6 +335,8 @@ xchk_bmap_iextent_xref( case XFS_COW_FORK: xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); break; } -- cgit From 5eef46358fae1a6018d9f886a3ecd30e843728dd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 6 Nov 2022 17:03:22 -0800 Subject: xfs: teach scrub to flag non-extents format cow forks CoW forks only exist in memory, which means that they can only ever have an incore extent tree. Hence they must always be FMT_EXTENTS, so check this when we're scrubbing them. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/bmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 4bb6672b02ba..d50d0eab196a 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -748,6 +748,8 @@ xchk_bmap( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: /* No mappings to check. 
*/ + if (whichfork == XFS_COW_FORK) + xchk_fblock_set_corrupt(sc, whichfork, 0); goto out; case XFS_DINODE_FMT_EXTENTS: break; -- cgit From bd5ab5f9874109586cbae5bc98e1f9ff574627e2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 16:08:03 -0800 Subject: xfs: don't warn about files that are exactly s_maxbytes long We can handle files that are exactly s_maxbytes bytes long; we just can't handle anything larger than that. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 51820b40ab1c..7a2f38e5202c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -365,7 +365,7 @@ xchk_dinode( * pagecache can't cache all the blocks in this file due to * overly large offsets, flag the inode for admin review. */ - if (isize >= mp->m_super->s_maxbytes) + if (isize > mp->m_super->s_maxbytes) xchk_ino_set_warning(sc, ino); /* di_nblocks */ -- cgit From f36b954a1f1bf06b5746fea7ecf0fa639ac65324 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 16:08:03 -0800 Subject: xfs: check inode core when scrubbing metadata files Metadata files (e.g. realtime bitmaps and quota files) do not show up in the bulkstat output, which means that scrub-by-handle does not work; they can only be checked through a specific scrub type. Therefore, each scrub type calls xchk_metadata_inode_forks to check the metadata for whatever's in the file. Unfortunately, that function doesn't actually check the inode record itself. Refactor the function a bit to make that happen. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/common.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index ad70f29233c3..613260b04a3d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -781,6 +781,33 @@ xchk_buffer_recheck( trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); } +static inline int +xchk_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + int error; + + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + break; + } + + sc->sm->sm_type = smtype; + return error; +} + /* * Scrub the attr/data forks of a metadata inode. The metadata inode must be * pointed to by sc->ip and the ILOCK must be held. @@ -789,13 +816,17 @@ int xchk_metadata_inode_forks( struct xfs_scrub *sc) { - __u32 smtype; bool shared; int error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return 0; + /* Check the inode record. */ + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + /* Metadata inodes don't live on the rt device. */ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -815,10 +846,7 @@ xchk_metadata_inode_forks( } /* Invoke the data fork scrubber. 
*/ - smtype = sc->sm->sm_type; - sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; - error = xchk_bmap_data(sc); - sc->sm->sm_type = smtype; + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; @@ -833,7 +861,7 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } - return error; + return 0; } -- cgit From 2653d53345bda90604f673bb211dd060a5a5c232 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Nov 2022 19:20:20 -0800 Subject: xfs: fix incorrect error-out in xfs_remove Clean up resources if resetting the dotdot entry doesn't succeed. Observed through code inspection. Fixes: 5838d0356bb3 ("xfs: reset child dir '..' entry when unlinking child") Signed-off-by: Darrick J. Wong Reviewed-by: Andrey Albershteyn --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index aa303be11576..d354ea2b74f9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2479,7 +2479,7 @@ xfs_remove( error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, tp->t_mountp->m_sb.sb_rootino, 0); if (error) - return error; + goto out_trans_cancel; } } else { /* -- cgit From 59f6ab40fd8735c9a1a15401610a31cc06a0bbd6 Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 16 Nov 2022 19:20:20 -0800 Subject: xfs: fix sb write verify for lazysbcount When lazysbcount is enabled, fsstress and loop mount/unmount tests report the following problems: XFS (loop0): SB summary counter sanity check failed XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460, xfs_sb block 0x0 XFS (loop0): Unmount and run xfs_repair XFS (loop0): First 128 bytes of corrupted metadata buffer: 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(.. 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z 00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... .......... 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ 00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................ 00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................ XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply +0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem. XFS (loop0): Please unmount the filesystem and rectify the problem(s) XFS (loop0): log mount/recovery failed: error -117 XFS (loop0): log mount failed This corruption will shut down the file system, and the file system will no longer be mountable. The following script can reproduce the problem, but it may take a long time. #!/bin/bash device=/dev/sda testdir=/mnt/test round=0 function fail() { echo "$*" exit 1 } mkdir -p $testdir while [ $round -lt 10000 ] do echo "******* round $round ********" mkfs.xfs -f $device mount $device $testdir || fail "mount failed!" fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null & sleep 4 killall -w fsstress umount $testdir xfs_repair -e $device > /dev/null if [ $? -eq 2 ];then echo "ERR CODE 2: Dirty log exception during repair." exit 1 fi round=$(($round+1)) done With lazysbcount enabled, there is no additional lock protection for reading m_ifree and m_icount in xfs_log_sb(); if another CPU modifies m_ifree, this can make m_ifree greater than m_icount.
For example, consider the following sequence, where ifreedelta is positive: CPU0 CPU1 xfs_log_sb xfs_trans_unreserve_and_mod_sb ---------- ------------------------------ percpu_counter_sum(&mp->m_icount) percpu_counter_add_batch(&mp->m_icount, idelta, XFS_ICOUNT_BATCH) percpu_counter_add(&mp->m_ifree, ifreedelta); percpu_counter_sum(&mp->m_ifree) After this, an incorrect inode count (sb_ifree > sb_icount) will be written to the log. In the subsequent writing of the sb, the incorrect inode count (sb_ifree > sb_icount) will fail to pass the boundary check in xfs_validate_sb_write(), which causes the file system to shut down. When lazysbcount is enabled, we don't need to guarantee that lazy sb counters are completely correct, but we do need to guarantee that sb_ifree <= sb_icount. On the other hand, the constraint that m_ifree <= m_icount must be satisfied any time that there /cannot/ be other threads allocating or freeing inode chunks. If the constraint is violated under these circumstances, sb_i{count,free} (the ondisk superblock inode counters) may be incorrect and need to be marked sick at unmount; the counts will be rebuilt on the next mount. Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_sb.c | 4 +++- fs/xfs/xfs_mount.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a20cade590e9..1eeecf2eb2a7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -972,7 +972,9 @@ xfs_log_sb( */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_ifree = min_t(uint64_t, + percpu_counter_sum(&mp->m_ifree), + mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e8bb3c2e847e..fb87ffb48f7f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -538,6 +538,20 @@ xfs_check_summary_counts( return 0; } +static void +xfs_unmount_check( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return; + + if (percpu_counter_sum(&mp->m_ifree) > + percpu_counter_sum(&mp->m_icount)) { + xfs_alert(mp, "ifree/icount mismatch at unmount"); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); + } +} + /* * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, @@ -1077,6 +1091,7 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); + xfs_unmount_check(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); -- cgit From 64c80dfd04d1dd2ecf550542c8f3f41b54b20207 Mon Sep 17 00:00:00 2001 From: Lukas Herbolt Date: Wed, 16 Nov 2022 19:20:21 -0800 Subject: xfs: Print XFS UUID on mount and umount events. As of now only device names are printed out over __xfs_printk(). The device names are not persistent across reboots, which, when searching for the origin of a corruption, adds the extra task of properly identifying the devices. This patch adds the XFS UUID to every mount/umount event, which will make the identification much easier. Signed-off-by: Lukas Herbolt [sandeen: rebase onto current upstream kernel] Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J.
Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_log.c | 10 ++++++---- fs/xfs/xfs_super.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f02a0dd522b3..0141d9907d31 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -644,12 +644,14 @@ xfs_log_mount( int min_logfsbs; if (!xfs_has_norecovery(mp)) { - xfs_notice(mp, "Mounting V%d Filesystem", - XFS_SB_VERSION_NUM(&mp->m_sb)); + xfs_notice(mp, "Mounting V%d Filesystem %pU", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); } else { xfs_notice(mp, -"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", - XFS_SB_VERSION_NUM(&mp->m_sb)); +"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb), + &mp->m_sb.sb_uuid); ASSERT(xfs_is_readonly(mp)); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ee4b429a2f2c..0c4b73e9b29d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1110,7 +1110,7 @@ xfs_fs_put_super( if (!sb->s_fs_info) return; - xfs_notice(mp, "Unmounting Filesystem"); + xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); -- cgit From 7e8e5cc818bd93ee7f2699676f2e5b30d26d83f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 06:46:52 -0500 Subject: filelock: WARN_ON_ONCE when ->fl_file and filp don't match vfs_lock_file, vfs_test_lock and vfs_cancel_lock all take both a struct file argument and a file_lock. The file_lock has a fl_file field in it, however, and it _must_ match the file passed in. While most of the locks.c routines use the separately-passed file argument, some filesystems rely on fl_file being filled out correctly. I'm working on a patch series to remove the redundant argument from these routines, but for now, let's ensure that the callers always set this properly by issuing a WARN_ON_ONCE if they ever don't match. Cc: Chuck Lever Cc: Trond Myklebust Signed-off-by: Jeff Layton --- fs/locks.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 607f94a0e789..5876c8ff0edc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2146,6 +2146,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); @@ -2295,6 +2296,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else @@ -2663,6 +2665,7 @@ void locks_remove_file(struct file *filp) */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; -- cgit From dc64cc12bcd14219afb91b55d23192c3eb45aa43 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Mon, 14 Nov 2022 22:17:57 -0500 Subject: binfmt_elf: replace IS_ERR() with IS_ERR_VALUE() Avoid typecasts that are needed for IS_ERR() and use IS_ERR_VALUE() instead.
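For illustration, a minimal sketch of the pattern being replaced (the variable and its source are hypothetical, not lifted from fs/binfmt_elf.c):

```c
/* Hypothetical: "addr" holds either a mapped address or a negated
 * errno packed into an unsigned long, as elf_map() returns. */
unsigned long addr;
int retval;

/* Old style: cast to a pointer only so IS_ERR() can test it. */
if (IS_ERR((void *)addr))
	retval = PTR_ERR((void *)addr);

/* New style: IS_ERR_VALUE() tests the unsigned long directly. */
if (IS_ERR_VALUE(addr))
	retval = (int)addr;
```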
Signed-off-by: Bo Liu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221115031757.2426-1-liubo03@inspur.com --- fs/binfmt_elf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 528e2ac8931f..d9af34f816a9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1160,7 +1160,7 @@ out_free_interp: error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { - retval = IS_ERR((void *)error) ? + retval = IS_ERR_VALUE(error) ? PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } @@ -1245,7 +1245,7 @@ out_free_interp: interpreter, load_bias, interp_elf_phdata, &arch_state); - if (!IS_ERR((void *)elf_entry)) { + if (!IS_ERR_VALUE(elf_entry)) { /* * load_elf_interp() returns relocation * adjustment @@ -1254,7 +1254,7 @@ out_free_interp: elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { - retval = IS_ERR((void *)elf_entry) ? + retval = IS_ERR_VALUE(elf_entry) ? (int)elf_entry : -EINVAL; goto out_free_dentry; } -- cgit From cd57e443831d8eeb083c7165bce195d886e216d4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 16:31:55 -0800 Subject: exec: Remove FOLL_FORCE for stack setup It does not appear that FOLL_FORCE should be needed for setting up the stack pages. They are allocated using the nascent bprm->vma, which was newly created with VM_STACK_FLAGS, which an arch can override, but they all appear to include VM_WRITE | VM_MAYWRITE. Remove FOLL_FORCE. Cc: Eric Biederman Cc: David Hildenbrand Cc: Linus Torvalds Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Link: https://lore.kernel.org/lkml/202211171439.CDE720EAD@keescook/ Signed-off-by: Kees Cook --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 9585bc1bc970..870a707b5d3b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; - unsigned int gup_flags = FOLL_FORCE; + unsigned int gup_flags = 0; #ifdef CONFIG_STACK_GROWSUP if (write) { -- cgit From 8032bf1233a74627ce69b803608e650f3f35971c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:44:02 -0600 Subject: treewide: use get_random_u32_below() instead of deprecated function This is a simple mechanical transformation done by: @@ expression E; @@ - prandom_u32_max + get_random_u32_below (E) Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Acked-by: Darrick J. Wong # for xfs Reviewed-by: SeongJae Park # for damon Reviewed-by: Jason Gunthorpe # for infiniband Reviewed-by: Russell King (Oracle) # for arm Acked-by: Ulf Hansson # for mmc Signed-off-by: Jason A.
Donenfeld --- fs/ceph/inode.c | 2 +- fs/ceph/mdsmap.c | 2 +- fs/ext2/ialloc.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/super.c | 5 ++--- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 8 ++++---- fs/ubifs/debug.c | 8 ++++---- fs/ubifs/lpt_commit.c | 14 +++++++------- fs/ubifs/tnc_commit.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_error.c | 2 +- 13 files changed, 26 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4af5e55abc15..fb255988dee8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32_max(nsplits); + i = get_random_u32_below(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 3fbabc98e1f7..7dac21ee6ce7 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32_max(n); + n = get_random_u32_below(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index f4944c4dee60..78b8686d9a4a 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -277,7 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - parent_group = prandom_u32_max(ngroups); + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext2_get_group_desc (sb, group, NULL); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e9bc46684106..9fc1af8e19a3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -465,7 +465,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else - parent_group = prandom_u32_max(ngroups); + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7cdd2138c897..63ef74eb8091 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3778,7 +3778,7 @@ cont_thread: } if (!progress) { elr->lr_next_sched = jiffies + - prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ); + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@ -3925,8 +3925,7 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. 
*/ - elr->lr_next_sched = jiffies + prandom_u32_max( - EXT4_DEF_LI_MAX_START_DELAY * HZ); + elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4546e01b2ee0..536d332d9e2e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -282,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* let's select beginning hot/small space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf3d3fa4363..334415d946f8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2534,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) @@ -2588,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_below(sbi->max_fragment_chunk) + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2625,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_below(sbi->max_fragment_chunk) + 1; seg->next_blkoff += - prandom_u32_max(sbi->max_fragment_hole) + 1; + get_random_u32_below(sbi->max_fragment_hole) + 1; } } } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 3f128b9fdfbb..9c9d3f0e36a4 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2467,7 +2467,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!(prandom_u32_max(out_of) + 1 <= n); + return !!(get_random_u32_below(out_of) + 1 <= n); } @@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail within 1 minute */ - delay = prandom_u32_max(60000); + delay = get_random_u32_below(60000); d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; - delay = prandom_u32_max(10000); + delay = get_random_u32_below(10000); /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn(c, "failing after %lu calls", delay); @@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32_max(len); + from = get_random_u32_below(len); /* Corruption span max to end of write unit */ to = min(len, ALIGN(from + 1, c->max_write_size)); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index cfbc31f709f4..c4d079328b92 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) 
return 0; - if (prandom_u32_max(4)) + if (get_random_u32_below(4)) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 01362ad5f804..a55e04822d16 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !prandom_u32_max(8)) + if (dbg_is_chk_index(c) && !get_random_u32_below(8)) return -ENOSPC; return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index de79f5d07f65..989cf341779b 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1516,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32_max(2)) + if (get_random_u32_below(2)) return 0; #endif diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 94db50eb706a..5118dedf9267 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32_max(2); + do_sparse = get_random_u32_below(2); #endif /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f18..822e6a0e9d1a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -279,7 +279,7 @@ xfs_errortag_test( ASSERT(error_tag < XFS_ERRTAG_MAX); randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32_max(randfactor)) + if (!randfactor || get_random_u32_below(randfactor)) return false; xfs_warn_ratelimited(mp, -- cgit From d247aabd391c3b2fa4f26874ed9136a7a142fcfd Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:44:02 -0600 Subject: treewide: use get_random_u32_{above,below}() instead of manual loop These cases were done with this Coccinelle: @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I > E); + I = get_random_u32_below(E + 1); @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I >= E); + I = get_random_u32_below(E); @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... 
when != I - } while (I < E); + I = get_random_u32_above(E - 1); @@ expression E; identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I <= E); + I = get_random_u32_above(E); @@ identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (!I); + I = get_random_u32_above(0); @@ identifier I; @@ - do { ... when != I - I = get_random_u32(); ... when != I - } while (I == 0); + I = get_random_u32_above(0); @@ expression E; @@ - E + 1 + get_random_u32_below(U32_MAX - E) + get_random_u32_above(E) Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jason A. Donenfeld --- fs/ext4/mmp.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 588cb09c5291..4681fff6665f 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -262,13 +262,7 @@ void ext4_stop_mmpd(struct ext4_sb_info *sbi) */ static unsigned int mmp_new_seq(void) { - u32 new_seq; - - do { - new_seq = get_random_u32(); - } while (new_seq > EXT4_MMP_SEQ_MAX); - - return new_seq; + return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1); } /* -- cgit From e8a533cbeb79809206f8724e89961e0079508c3c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 9 Oct 2022 20:44:02 -0600 Subject: treewide: use get_random_u32_inclusive() when possible These cases were done with this Coccinelle: @@ expression H; expression L; @@ - (get_random_u32_below(H) + L) + get_random_u32_inclusive(L, H + L - 1) @@ expression H; expression L; expression E; @@ get_random_u32_inclusive(L, H - + E - - E ) @@ expression H; expression L; expression E; @@ get_random_u32_inclusive(L, H - - E - + E ) @@ expression H; expression L; expression E; expression F; @@ get_random_u32_inclusive(L, H - - E + F - + E ) @@ expression H; expression L; expression E; expression F; @@ get_random_u32_inclusive(L, H - + E + F - - E ) And then subsequently cleaned up by hand, with several automatic cases rejected if it didn't make sense contextually. Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe # for infiniband Signed-off-by: Jason A. 
Donenfeld --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 334415d946f8..b304692c0cf5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2588,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - get_random_u32_below(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2625,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - get_random_u32_below(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += - get_random_u32_below(sbi->max_fragment_hole) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_hole); } } } -- cgit From 8603b6f58637ce196d68f7749633ea81af196d66 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 3 Sep 2022 08:43:30 +0200 Subject: core_pattern: add CPU specifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Statistically, in a large deployment regular segfaults may indicate a CPU issue. Currently, it is not possible to find out what CPU the segfault happened on. There are at least two attempts to improve segfault logging in this regard, but they do not help in case the logs rotate. Hence, let's make sure it is possible to permanently record the CPU the task ran on using a new core_pattern specifier. Link: https://lkml.kernel.org/r/20220903064330.20772-1-oleksandr@redhat.com Signed-off-by: Oleksandr Natalenko Suggested-by: Renaud Métrich Reviewed-by: Oleg Nesterov Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Grzegorz Halat Cc: "Guilherme G. Piccoli" Cc: "Huang, Ying" Cc: Jason A. Donenfeld Cc: Joel Savitz Cc: Jonathan Corbet Cc: Kees Cook Cc: Laurent Dufour Cc: Luis Chamberlain Cc: Rob Herring Cc: Stephen Kitt Cc: Will Deacon Cc: Xiaoming Ni Signed-off-by: Andrew Morton --- fs/coredump.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..3e8630c8d627 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* CPU the task ran on */ + case 'C': + err = cn_printf(cn, "%d", cprm->cpu); + break; default: break; } @@ -534,6 +538,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) */ .mm_flags = mm->flags, .vma_meta = NULL, + .cpu = raw_smp_processor_id(), }; audit_core_dumps(siginfo->si_signo); -- cgit From 71dd5d651be7c99c401ad642e9f3cba88b956bce Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:44 +0800 Subject: ocfs2/cluster: use bitmap API instead of hand-writing it Use bitmap_zero/bitmap_copy/bitmap_equal directly for bitmap operations.
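For context, a hedged sketch of the before/after pattern (illustrative only; the real call sites operate on the o2hb/o2nm node maps shown in the diff below):

```c
/* A node map is an array of unsigned longs holding O2NM_MAX_NODES bits. */
unsigned long a[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long b[BITS_TO_LONGS(O2NM_MAX_NODES)];
bool same;

/* Hand-written, byte-based operations... */
memset(a, 0, sizeof(a));
same = !memcmp(a, b, sizeof(a));

/* ...and their bit-based bitmap API equivalents: */
bitmap_zero(a, O2NM_MAX_NODES);
same = bitmap_equal(a, b, O2NM_MAX_NODES);
```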
Link: https://lkml.kernel.org/r/20221007124846.186453-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 20 ++++++++++---------- fs/ocfs2/cluster/nodemanager.c | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b13d344d40b6..8fe6031f60e3 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg) /* negotiate timeout must be less than write timeout. */ schedule_delayed_work(®->hr_nego_timeout_work, msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); - memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); + bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES); } static void o2hb_disarm_timeout(struct o2hb_region *reg) @@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work) config_item_name(®->hr_item), reg->hr_bdev); set_bit(master_node, reg->hr_nego_node_bitmap); } - if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, - sizeof(reg->hr_nego_node_bitmap))) { + if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, + O2NM_MAX_NODES)) { /* check negotiate bitmap every second to do timeout * approve decision. */ @@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * live nodes heartbeat on it. In other words, the region has been * added to all nodes. */ - if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, - sizeof(o2hb_live_node_bitmap))) + if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + O2NM_MAX_NODES)) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", @@ -1437,11 +1437,11 @@ void o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); - memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); - memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); - memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES); + bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); o2hb_dependent_users = 0; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 27fee68f860a..2f61d39e4e50 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes) return -EINVAL; read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES); read_unlock(&cluster->cl_nodes_lock); return 0; -- cgit From 6d4a93b6809270e5c7d7216b20f4ef2e88213eb3 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:45 +0800 Subject: ocfs2: use bitmap API in fill_node_map Pass bits directly into fill_node_map helper and use bitmap API directly to simplify code. 
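A short note on the interface change, with a hedged usage sketch (helper and constant names as in the diff; surrounding code elided): the helper used to take a byte count that fed memcpy() sizes, and now takes a bit count that feeds bitmap_copy() directly, so callers switch from sizeof() to O2NM_MAX_NODES.

```c
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];

o2hb_fill_node_map(map, sizeof(map));      /* before: bytes (32 on 64-bit) */
o2hb_fill_node_map(map, O2NM_MAX_NODES);   /* after: bits */
```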
Link: https://lkml.kernel.org/r/20221007124846.186453-2-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 18 ++++++++---------- fs/ocfs2/cluster/heartbeat.h | 2 +- fs/ocfs2/cluster/netdebug.c | 2 +- fs/ocfs2/cluster/tcp.c | 6 ++---- fs/ocfs2/dlm/dlmdomain.c | 2 +- fs/ocfs2/stack_o2cb.c | 6 +++--- 6 files changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 8fe6031f60e3..60b97c92e2b2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (reg->hr_last_hb_status) return; - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); /* lowest node as master node to make negotiate decision. */ master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); @@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * If a node is not configured but is in the livemap, we still need * to read the slot so as to be able to remove it from the livemap. */ - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); i = -1; while ((i = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { @@ -1450,23 +1450,21 @@ void o2hb_init(void) /* if we're already in a callback then we're already serialized by the sem */ static void o2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) + unsigned int bits) { - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &o2hb_live_node_bitmap, bytes); + bitmap_copy(map, o2hb_live_node_bitmap, bits); } /* * get a map of all nodes that are heartbeating in any regions */ -void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +void o2hb_fill_node_map(unsigned long *map, unsigned int bits) { /* callers want to serialize this map and callbacks so that they * can trust that they don't miss nodes coming to the party */ down_read(&o2hb_callback_sem); spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(map, bytes); + o2hb_fill_node_map_from_callback(map, bits); spin_unlock(&o2hb_live_lock); up_read(&o2hb_callback_sem); } @@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num) unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, @@ -2477,7 +2475,7 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 1d4100abf6f8..8ef8c1b9eeb7 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid, void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); 
void o2hb_fill_node_map(unsigned long *map, - unsigned bytes); + unsigned int bits); void o2hb_exit(void); void o2hb_init(void); int o2hb_check_node_heartbeating_no_sem(u8 node_num); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7524994e3199..35c05c18de59 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len) unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; int i = -1, out = 0; - o2net_fill_node_map(map, sizeof(map)); + o2net_fill_node_map(map, O2NM_MAX_NODES); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f660c0dbdb63..6f5a3fb97c7f 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, } /* Get a map of all nodes to which this node is currently connected to */ -void o2net_fill_node_map(unsigned long *map, unsigned bytes) +void o2net_fill_node_map(unsigned long *map, unsigned int bits) { struct o2net_sock_container *sc; int node, ret; - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); + bitmap_zero(map, bits); for (node = 0; node < O2NM_MAX_NODES; ++node) { if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) continue; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c4eccd499db8..f46941ff665d 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1604,7 +1604,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) /* group sem locking should work for us here -- we're already * registered for heartbeat events so filling this should be * atomic wrt getting those handlers called. */ - o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 88f75f7f02d7..c973c03f6fd8 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -273,17 +273,17 @@ static int o2cb_cluster_check(void) */ #define O2CB_MAP_STABILIZE_COUNT 60 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { - o2hb_fill_node_map(hbmap, sizeof(hbmap)); + o2hb_fill_node_map(hbmap, O2NM_MAX_NODES); if (!test_bit(node_num, hbmap)) { printk(KERN_ERR "o2cb: %s heartbeat has not been " "started.\n", (o2hb_global_heartbeat_active() ? "Global" : "Local")); return -EINVAL; } - o2net_fill_node_map(netmap, sizeof(netmap)); + o2net_fill_node_map(netmap, O2NM_MAX_NODES); /* Force set the current node to allow easy compare */ set_bit(node_num, netmap); - if (!memcmp(hbmap, netmap, sizeof(hbmap))) + if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES)) return 0; if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); -- cgit From b270f492dc45b59d573cd80795e8b4781e959836 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:46 +0800 Subject: ocfs2/dlm: use bitmap API instead of hand-writing it Use bitmap_zero/bitmap_copy/bitmap_equal directly for bitmap operations.
Link: https://lkml.kernel.org/r/20221007124846.186453-3-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmcommon.h | 2 +- fs/ocfs2/dlm/dlmdomain.c | 17 +++++++---------- fs/ocfs2/dlm/dlmmaster.c | 30 +++++++++++++++--------------- fs/ocfs2/dlm/dlmrecovery.c | 2 +- 4 files changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fd2022712167..20f790a47484 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err) static inline void dlm_node_iter_init(unsigned long *map, struct dlm_node_iter *iter) { - memcpy(iter->node_map, map, sizeof(iter->node_map)); + bitmap_copy(iter->node_map, map, O2NM_MAX_NODES); iter->curnode = -1; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index f46941ff665d..5c04dde99981 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); /* For now, we restart the process if the node maps have * changed at all */ - ret = memcmp(ctxt->live_map, dlm->live_nodes_map, - sizeof(dlm->live_nodes_map)); + ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map, + O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); if (ret) @@ -1607,10 +1607,8 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); - memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); - + bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES); __dlm_set_joining_node(dlm, dlm->node_num); - spin_unlock(&dlm->spinlock); node = -1; @@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) * yes_resp_map. Copy that into our domain map and send a join * assert message to clean up everyone elses state. 
*/ spin_lock(&dlm->spinlock); - memcpy(dlm->domain_map, ctxt->yes_resp_map, - sizeof(ctxt->yes_resp_map)); + bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES); set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); @@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", dlm->recovery_map, &(dlm->recovery_map[0])); - memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); - memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); - memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES); + bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES); + bitmap_zero(dlm->domain_map, O2NM_MAX_NODES); dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 227da5b1b6ab..d610da8e2f24 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->type = type; INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); spin_lock_init(&mle->spinlock); init_waitqueue_head(&mle->wq); atomic_set(&mle->woken, 0); kref_init(&mle->mle_refs); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; @@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, atomic_inc(&dlm->mle_cur_count[mle->type]); /* copy off the node_map and register hb callbacks on our copy */ - memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); - memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES); + bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES); clear_bit(dlm->node_num, mle->vote_map); clear_bit(dlm->node_num, mle->node_map); @@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); - memset(res->refmap, 0, sizeof(res->refmap)); + bitmap_zero(res->refmap, O2NM_MAX_NODES); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -1036,10 +1036,10 @@ recheck: spin_lock(&mle->spinlock); m = mle->master; - map_changed = (memcmp(mle->vote_map, mle->node_map, - sizeof(mle->vote_map)) != 0); - voting_done = (memcmp(mle->vote_map, mle->response_map, - sizeof(mle->vote_map)) == 0); + map_changed = !bitmap_equal(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); + voting_done = bitmap_equal(mle->vote_map, mle->response_map, + O2NM_MAX_NODES); /* restart if we hit any errors */ if (map_changed) { @@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* now blank out everything, as if we had never * contacted anyone */ - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); /* reset the vote_map to the current node_map */ - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); + bitmap_copy(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); /* put myself into the maybe map */ if (mle->type != DLM_MLE_BLOCK) set_bit(dlm->node_num, mle->maybe_map); @@ -2094,7 +2094,7 @@ static void 
dlm_assert_master_worker(struct dlm_work_item *item, void *data) flags = item->u.am.flags; spin_lock(&dlm->spinlock); - memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); clear_bit(dlm->node_num, nodemap); @@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = 0; } - memset(iter.node_map, 0, sizeof(iter.node_map)); + bitmap_zero(iter.node_map, O2NM_MAX_NODES); set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 52ad342fec3e..50da8af988c1 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_reco_node_data *ndata; spin_lock(&dlm->spinlock); - memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES); /* nodes can only be removed (by dying) after dropping * this lock, and death will be trapped later, so this should do */ spin_unlock(&dlm->spinlock); -- cgit From 12b9d301ff73122aebd78548fa4c04ca69ed78fe Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 29 Sep 2022 12:29:33 +0800 Subject: proc/vmcore: fix potential memory leak in vmcore_init() Patch series "Some minor cleanup patches resent". The first three patches are trivial cleanup patches. And for the patch "kexec: replace crash_mem_range with range", I got an ibm-p9wr ppc64le system to test; it works well. This patch (of 4): elfcorehdr_alloc() allocates a memory chunk for elfcorehdr_addr with kzalloc(). If is_vmcore_usable() returns false, elfcorehdr_addr is a predefined value. If parse_crash_elf_headers() gets some error and returns a negative value, the elfcorehdr_addr should be released with elfcorehdr_free(). Fix it by calling elfcorehdr_free() when parse_crash_elf_headers() fails. Link: https://lkml.kernel.org/r/20220929042936.22012-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220929042936.22012-2-bhe@redhat.com Signed-off-by: Jianglei Nie Signed-off-by: Baoquan He Acked-by: Baoquan He Cc: Benjamin Herrenschmidt Cc: Chen Lifu Cc: "Eric W . Biederman" Cc: Li Chen Cc: Michael Ellerman Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: ye xingchen Cc: Zeal Robot Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..74747571d58e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1567,6 +1567,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { + elfcorehdr_free(elfcorehdr_addr); pr_warn("Kdump: vmcore not initialized\n"); return rc; } -- cgit From f1f1f2569901ec5b9d425f2e91c09a0e320768f3 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Thu, 22 Sep 2022 15:40:26 -0700 Subject: proc: report open files as size in stat() for /proc/pid/fd Many monitoring tools include open file count as a metric. Currently the only way to get this number is to enumerate the files in /proc/pid/fd. The problem with the current approach is that it does many things people generally don't care about when they need one number for a metric. In our tests for cadvisor, which reports open file counts per cgroup, we observed that reading the number of open files is slow.
Out of 35.23% of CPU time spent in `proc_readfd_common`, we see 29.43% spent in `proc_fill_cache`, which is responsible for filling dentry info. Some of this extra time is spinlock contention, but it's a contention for the lock we don't want to take to begin with. We considered putting the number of open files in /proc/pid/status. Unfortunately, counting the number of fds involves iterating the open_files bitmap, which has a linear complexity in proportion with the number of open files (bitmap slots really, but it's close). We don't want to make /proc/pid/status any slower, so instead we put this info in /proc/pid/fd as a size member of the stat syscall result. Previously the reported number was zero, so there's very little risk of breaking anything, while still providing a somewhat logical way to count the open files with a fallback if it's zero. RFC for this patch included iterating open fds under RCU. Thanks to Frank Hofmann for the suggestion to use the bitmap instead. Previously: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 0 Blocks: 0 IO Block: 1024 directory ``` With this patch: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 65 Blocks: 0 IO Block: 1024 directory ``` Correctness check: ``` $ sudo ls /proc/1/fd | wc -l 65 ``` I added the docs for /proc/<pid>/fd while I'm at it. [ivan@cloudflare.com: use bitmap_weight() to count the bits] Link: https://lkml.kernel.org/r/20221018045844.37697-1-ivan@cloudflare.com [akpm@linux-foundation.org: include linux/bitmap.h for bitmap_weight()] [ivan@cloudflare.com: return errno from proc_fd_getattr() instead of setting negative size] Link: https://lkml.kernel.org/r/20221024173140.30673-1-ivan@cloudflare.com Link: https://lkml.kernel.org/r/20220922224027.59266-1-ivan@cloudflare.com Signed-off-by: Ivan Babrou Cc: Alexey Dobriyan Cc: Al Viro Cc: Christoph Anton Mitterer Cc: David Hildenbrand Cc: David Laight Cc: Ivan Babrou Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kalesh Singh Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/proc/fd.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'fs') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36..fc46d6fe080c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -7,6 +7,7 @@ #include #include #include +#include <linux/bitmap.h> #include #include #include @@ -279,6 +280,30 @@ out: return 0; } +static int proc_readfd_count(struct inode *inode, loff_t *count) +{ + struct task_struct *p = get_proc_task(inode); + struct fdtable *fdt; + + if (!p) + return -ENOENT; + + task_lock(p); + if (p->files) { + rcu_read_lock(); + + fdt = files_fdtable(p->files); + *count = bitmap_weight(fdt->open_fds, fdt->max_fds); + + rcu_read_unlock(); + } + task_unlock(p); + + put_task_struct(p); + + return 0; +} + static int proc_readfd(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); @@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } +static int proc_fd_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + int rv = 0; + + generic_fillattr(&init_user_ns, inode, stat); + + /* If it's a directory, put the number of open fds there */ + if (S_ISDIR(inode->i_mode)) { + rv = proc_readfd_count(inode, &stat->size); + if (rv < 0) + return rv; + } + + return rv; +} + const struct inode_operations
proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, + .getattr = proc_fd_getattr, .setattr = proc_setattr, }; -- cgit From 941baf6febaa88c057192084ca280d976c7c7239 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 8 Sep 2022 21:21:54 +0300 Subject: proc: give /proc/cmdline size Most /proc files don't have length (in fstat sense). This leads to inefficiencies when reading such files with APIs commonly found in modern programming languages. They open the file, then fstat the descriptor, get st_size == 0, and either assume the file is empty or start reading without knowing the target size. cat(1) does OK because it uses a large enough buffer by default. But naive programs copy-pasted from SO aren't: let mut f = std::fs::File::open("/proc/cmdline").unwrap(); let mut buf: Vec<u8> = Vec::new(); f.read_to_end(&mut buf).unwrap(); will result in openat(AT_FDCWD, "/proc/cmdline", O_RDONLY|O_CLOEXEC) = 3 statx(0, NULL, AT_STATX_SYNC_AS_STAT, STATX_ALL, NULL) = -1 EFAULT (Bad address) statx(3, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_BASIC_STATS|STATX_MNT_ID, stx_attributes=0, stx_mode=S_IFREG|0444, stx_size=0, ...}) = 0 lseek(3, 0, SEEK_CUR) = 0 read(3, "BOOT_IMAGE=(hd3,gpt2)/vmlinuz-5.", 32) = 32 read(3, "19.6-100.fc35.x86_64 root=/dev/m", 32) = 32 read(3, "apper/fedora_localhost--live-roo"..., 64) = 64 read(3, "ocalhost--live-swap rd.lvm.lv=fe"..., 128) = 116 read(3, "", 12) open/stat is OK, lseek looks silly but there are 3 unnecessary reads because Rust starts with 32 bytes per Vec and grows from there. In case of /proc/cmdline, the length is known precisely. Make variables readonly while I'm at it. P.S.: I tried to scp /proc/cpuinfo today and got an empty file but this is a separate story. Link: https://lkml.kernel.org/r/YxoywlbM73JJN3r+@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/cmdline.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index fa762c5fbcb2..91fe1597af7b 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -3,6 +3,7 @@ #include #include #include +#include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { @@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v) static int __init proc_cmdline_init(void) { - proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init); -- cgit From 610a2a3d7d8be3537458a378ec69396a76c385b6 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 27 Oct 2022 13:43:05 +0900 Subject: nilfs2: fix shift-out-of-bounds/overflow in nilfs_sb2_bad_offset() Patch series "nilfs2: fix UBSAN shift-out-of-bounds warnings on mount time". The first patch fixes a bug reported by syzbot, and the second one fixes the remaining bug of the same kind. Although they are triggered by the same super block data anomaly, I divided it into the above two because the details of the issues and how to fix them are different. Both are required to eliminate the shift-out-of-bounds issues at mount time.
This patch (of 2): If the block size exponent information written in an on-disk superblock is corrupted, nilfs_sb2_bad_offset helper function can trigger shift-out-of-bounds warning followed by a kernel panic (if panic_on_warn is set): shift exponent 38983 is too large for 64-bit type 'unsigned long long' Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 ubsan_epilogue lib/ubsan.c:151 [inline] __ubsan_handle_shift_out_of_bounds+0x33d/0x3b0 lib/ubsan.c:322 nilfs_sb2_bad_offset fs/nilfs2/the_nilfs.c:449 [inline] nilfs_load_super_block+0xdf5/0xe00 fs/nilfs2/the_nilfs.c:523 init_nilfs+0xb7/0x7d0 fs/nilfs2/the_nilfs.c:577 nilfs_fill_super+0xb1/0x5d0 fs/nilfs2/super.c:1047 nilfs_mount+0x613/0x9b0 fs/nilfs2/super.c:1317 ... In addition, since nilfs_sb2_bad_offset() performs multiplication without considering the upper bound, the computation may overflow if the disk layout parameters are not normal. This fixes these issues by inserting preliminary sanity checks for those parameters and by converting the comparison from one involving multiplication and left bit-shifting to one using division and right bit-shifting. Link: https://lkml.kernel.org/r/20221027044306.42774-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20221027044306.42774-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+e91619dd4c11c4960706@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3b4a079c9617..d588816fdf2f 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "nilfs.h" #include "segment.h" @@ -443,11 +444,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) return crc == le32_to_cpu(sbp->s_sum); } -static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +/** + * nilfs_sb2_bad_offset - check the location of the second superblock + * @sbp: superblock raw data buffer + * @offset: byte offset of second superblock calculated from device size + * + * nilfs_sb2_bad_offset() checks if the position on the second + * superblock is valid or not based on the filesystem parameters + * stored in @sbp. If @offset points to a location within the segment + * area, or if the parameters themselves are not normal, it is + * determined to be invalid. + * + * Return Value: true if invalid, false if valid. 
+ */ +static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { - return offset < ((le64_to_cpu(sbp->s_nsegments) * - le32_to_cpu(sbp->s_blocks_per_segment)) << - (le32_to_cpu(sbp->s_log_block_size) + 10)); + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + u64 nsegments = le64_to_cpu(sbp->s_nsegments); + u64 index; + + if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS || + shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS) + return true; + + index = offset >> (shift_bits + BLOCK_SIZE_BITS); + do_div(index, blocks_per_segment); + return index < nsegments; } static void nilfs_release_super_block(struct the_nilfs *nilfs) -- cgit From ebeccaaef67a4895d2496ab8d9c2fb8d89201211 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 27 Oct 2022 13:43:06 +0900 Subject: nilfs2: fix shift-out-of-bounds due to too large exponent of block size If field s_log_block_size of superblock data is corrupted and too large, init_nilfs() and load_nilfs() still can trigger a shift-out-of-bounds warning followed by a kernel panic (if panic_on_warn is set): shift exponent 38973 is too large for 32-bit type 'int' Call Trace: dump_stack_lvl+0xcd/0x134 ubsan_epilogue+0xb/0x50 __ubsan_handle_shift_out_of_bounds.cold.12+0x17b/0x1f5 init_nilfs.cold.11+0x18/0x1d [nilfs2] nilfs_mount+0x9b5/0x12b0 [nilfs2] ... This fixes the issue by adding and using a new helper function for getting block size with sanity check. Link: https://lkml.kernel.org/r/20221027044306.42774-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d588816fdf2f..20ff02b4ef5d 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -193,6 +193,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_get_blocksize - get block size from raw superblock data + * @sb: super block instance + * @sbp: superblock raw data buffer + * @blocksize: place to store block size + * + * nilfs_get_blocksize() calculates the block size from the block size + * exponent information written in @sbp and stores it in @blocksize, + * or aborts with an error message if it's too large. + * + * Return Value: On success, 0 is returned. If the block size is too + * large, -EINVAL is returned. 
+ */ +static int nilfs_get_blocksize(struct super_block *sb, + struct nilfs_super_block *sbp, int *blocksize) +{ + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + + if (unlikely(shift_bits > + ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) { + nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB", + shift_bits); + return -EINVAL; + } + *blocksize = BLOCK_SIZE << shift_bits; + return 0; +} + /** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released @@ -246,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); /* verify consistency between two super blocks */ - blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + err = nilfs_get_blocksize(sb, sbp[0], &blocksize); + if (err) + goto scan_error; + if (blocksize != nilfs->ns_blocksize) { nilfs_warn(sb, "blocksize differs between two super blocks (%d != %d)", blocksize, nilfs->ns_blocksize); + err = -EINVAL; goto scan_error; } @@ -609,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; - blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); - if (blocksize < NILFS_MIN_BLOCK_SIZE || - blocksize > NILFS_MAX_BLOCK_SIZE) { + err = nilfs_get_blocksize(sb, sbp, &blocksize); + if (err) + goto failed_sbh; + + if (blocksize < NILFS_MIN_BLOCK_SIZE) { nilfs_err(sb, "couldn't mount because of unsupported filesystem blocksize %d", blocksize); -- cgit From 80f784098ff44e086f68f0e8c98b6c6da8702ec4 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 19 Oct 2022 11:09:29 +0800 Subject: squashfs: add the mount parameter theads= Patch series 'squashfs: Add the mount parameter "threads="'. Currently, Squashfs supports multiple decompressor parallel modes. However, this mode can be configured only during kernel building and does not support flexible selection during runtime. In the current patch set, the mount parameter "threads=" is added to allow users to select the parallel decompressor mode and configure the number of decompressors when mounting a file system. "threads=<single|multi|percpu|1|2|3|...>" The upper limit is num_online_cpus() * 2. This patch (of 2): Squashfs supports three decompression concurrency modes: Single-thread mode: concurrent reads are blocked and the memory overhead is small. Multi-thread mode/percpu mode: reduces concurrent read blocking but increases memory overhead. The corresponding scheme must be fixed at compile time. During mounting, the concurrent decompression mode cannot be adjusted based on file read blocking. The mount parameter threads= is added to select the concurrent decompression mode of a single SquashFS file system image.
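For illustration only, with CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT enabled, mode selection at mount time would look like this (device and mount point are hypothetical):

```
# one decompressor, smallest memory overhead
mount -t squashfs -o threads=single /dev/loop0 /mnt
# a pool of decompressors, allocated on demand
mount -t squashfs -o threads=multi /dev/loop0 /mnt
# one decompressor per CPU core
mount -t squashfs -o threads=percpu /dev/loop0 /mnt
```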
Link: https://lkml.kernel.org/r/20221019030930.130456-1-nixiaoming@huawei.com Link: https://lkml.kernel.org/r/20221019030930.130456-2-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Jianguo Chen Cc: Jubin Zhong Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 39 ++++++++++++++++++++--- fs/squashfs/block.c | 2 +- fs/squashfs/decompressor.c | 2 +- fs/squashfs/decompressor_multi.c | 16 +++++++--- fs/squashfs/decompressor_multi_percpu.c | 23 +++++++++----- fs/squashfs/decompressor_single.c | 15 ++++++--- fs/squashfs/squashfs.h | 23 +++++++++++--- fs/squashfs/squashfs_fs_sb.h | 3 +- fs/squashfs/super.c | 56 +++++++++++++++++++++++++++++++-- 9 files changed, 147 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 916e78fabcaa..218bacdd4298 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -54,9 +54,36 @@ config SQUASHFS_FILE_DIRECT endchoice +config SQUASHFS_DECOMP_SINGLE + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI_PERCPU + depends on SQUASHFS + def_bool n + +config SQUASHFS_CHOICE_DECOMP_BY_MOUNT + bool "Select the parallel decompression mode during mount" + depends on SQUASHFS + default n + select SQUASHFS_DECOMP_SINGLE + select SQUASHFS_DECOMP_MULTI + select SQUASHFS_DECOMP_MULTI_PERCPU + help + Compile all parallel decompression modes and specify the + decompression mode by setting "threads=" during mount. + threads=<single|multi|percpu> + + default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE + choice - prompt "Decompressor parallelisation options" + prompt "Select decompression parallel mode at compile time" depends on SQUASHFS + depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT help Squashfs now supports three parallelisation options for decompression. Each one exhibits various trade-offs between @@ -64,15 +91,17 @@ choice If in doubt, select "Single threaded compression" -config SQUASHFS_DECOMP_SINGLE +config SQUASHFS_COMPILE_DECOMP_SINGLE bool "Single threaded compression" + select SQUASHFS_DECOMP_SINGLE help Traditionally Squashfs has used single-threaded decompression. Only one block (data or metadata) can be decompressed at any one time. This limits CPU and memory usage to a minimum. -config SQUASHFS_DECOMP_MULTI +config SQUASHFS_COMPILE_DECOMP_MULTI bool "Use multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -85,8 +114,9 @@ config SQUASHFS_DECOMP_MULTI decompressors per core. It dynamically allocates decompressors on a demand basis. -config SQUASHFS_DECOMP_MULTI_PERCPU +config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU bool "Use percpu multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI_PERCPU help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -95,7 +125,6 @@ config SQUASHFS_DECOMP_MULTI_PERCPU This decompressor implementation uses a maximum of one decompressor per core. It uses percpu variables to ensure decompression is load-balanced across the cores.
- endchoice config SQUASHFS_XATTR diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 833aca92301f..bed3bb8b27fa 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - res = squashfs_decompress(msblk, bio, offset, length, output); + res = msblk->thread_ops->decompress(msblk, bio, offset, length, output); } else { res = copy_bio_to_actor(bio, output, offset, length); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index d57bef91ab08..8893cb9b4198 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) if (IS_ERR(comp_opts)) return comp_opts; - stream = squashfs_decompressor_create(msblk, comp_opts); + stream = msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index db9f12a3ea05..eb25432bd149 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -29,12 +29,11 @@ #define MAX_DECOMPRESSOR (num_online_cpus() * 2) -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return MAX_DECOMPRESSOR; } - struct squashfs_stream { void *comp_opts; struct list_head strm_list; @@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm, wake_up(&stream->wait); } -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -103,7 +102,7 @@ out: } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; if (stream) { @@ -180,7 +179,7 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, msblk->decompressor->name); return res; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index b881b9283b7f..1dfadf76ed9a 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -25,7 +25,7 @@ struct squashfs_stream { local_lock_t lock; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -59,7 +59,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream __percpu *percpu = (struct squashfs_stream __percpu *) msblk->stream; @@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int 
squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; int res; - local_lock(&msblk->stream->lock); - stream = this_cpu_ptr(msblk->stream); + local_lock(&percpu->lock); + stream = this_cpu_ptr(percpu); res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); - local_unlock(&msblk->stream->lock); + local_unlock(&percpu->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", @@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return num_possible_cpus(); } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 4eb3d083d45e..6f161887710b 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -24,7 +24,7 @@ struct squashfs_stream { struct mutex mutex; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -49,7 +49,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; @@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return 1; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9783e01c8100..a6164fdf9435 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ -extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); -extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, - int, int, struct squashfs_page_actor *); -extern int squashfs_max_decompressors(void); + +struct squashfs_decompressor_thread_ops { + void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts); + void (*destroy)(struct squashfs_sb_info *msblk); + int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor 
*output); + int (*max_decompressors)(void); +}; + +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu; +#endif /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1e90c2575f9b..f1e5dad8ae0a 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -53,7 +53,7 @@ struct squashfs_sb_info { __le64 *xattr_id_table; struct mutex meta_index_mutex; struct meta_index *meta_index; - struct squashfs_stream *stream; + void *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; @@ -66,5 +66,6 @@ struct squashfs_sb_info { int xattr_ids; unsigned int ids; bool panic_on_errors; + const struct squashfs_decompressor_thread_ops *thread_ops; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 32565dafa7f3..aac3ea72a9ba 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -47,10 +47,12 @@ enum Opt_errors { enum squashfs_param { Opt_errors, + Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; + const struct squashfs_decompressor_thread_ops *thread_ops; }; static const struct constant_table squashfs_param_errors[] = { @@ -61,9 +63,29 @@ static const struct constant_table squashfs_param_errors[] = { static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), + fsparam_string("threads", Opt_threads), {} }; +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (strcmp(str, "single") == 0) { + opts->thread_ops = &squashfs_decompressor_single; + return 0; + } + if (strcmp(str, "multi") == 0) { + opts->thread_ops = &squashfs_decompressor_multi; + return 0; + } + if (strcmp(str, "percpu") == 0) { + opts->thread_ops = &squashfs_decompressor_percpu; + return 0; + } +#endif + return -EINVAL; +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -78,6 +100,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para case Opt_errors: opts->errors = result.uint_32; break; + case Opt_threads: + if (squashfs_parse_param_threads(param->string, opts) != 0) + return -EINVAL; + break; default: return -EINVAL; } @@ -167,6 +193,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_bdev); goto failed_mount; } + msblk->thread_ops = opts->thread_ops; /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( @@ -252,7 +279,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - squashfs_max_decompressors(), msblk->block_size); + msblk->thread_ops->max_decompressors(), msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -383,7 +410,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); 
squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_destroy(msblk); + msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -435,6 +462,20 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root) else seq_puts(s, ",errors=continue"); +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (msblk->thread_ops == &squashfs_decompressor_single) { + seq_puts(s, ",threads=single"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_multi) { + seq_puts(s, ",threads=multi"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_percpu) { + seq_puts(s, ",threads=percpu"); + return 0; + } +#endif return 0; } @@ -446,6 +487,15 @@ static int squashfs_init_fs_context(struct fs_context *fc) if (!opts) return -ENOMEM; +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + opts->thread_ops = &squashfs_decompressor_single; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) + opts->thread_ops = &squashfs_decompressor_multi; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) + opts->thread_ops = &squashfs_decompressor_percpu; +#else +#error "fail: unknown squashfs decompression thread mode?" +#endif fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; @@ -478,7 +528,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_destroy(sbi); + sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); -- cgit From fb40fe04f9df23114782d5edd1c5d017ae9d0ca8 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 19 Oct 2022 11:09:30 +0800 Subject: squashfs: allows users to configure the number of decompression threads The maximum number of threads in the decompressor_multi.c file is fixed and cannot be adjusted according to user needs. Therefore, the mount parameter needs to be added to allow users to configure the number of threads as required. The upper limit is num_online_cpus() * 2. Link: https://lkml.kernel.org/r/20221019030930.130456-3-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Jianguo Chen Cc: Jubin Zhong Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 16 ++++++++++-- fs/squashfs/decompressor_multi.c | 4 +-- fs/squashfs/squashfs_fs_sb.h | 1 + fs/squashfs/super.c | 55 +++++++++++++++++++++++++++++++++++----- 4 files changed, 66 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 218bacdd4298..60fc98bdf421 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -73,11 +73,10 @@ config SQUASHFS_CHOICE_DECOMP_BY_MOUNT select SQUASHFS_DECOMP_SINGLE select SQUASHFS_DECOMP_MULTI select SQUASHFS_DECOMP_MULTI_PERCPU + select SQUASHFS_MOUNT_DECOMP_THREADS help Compile all parallel decompression modes and specify the decompression mode by setting "threads=" during mount. - threads= - default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE choice @@ -127,6 +126,19 @@ config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU decompression is load-balanced across the cores. endchoice +config SQUASHFS_MOUNT_DECOMP_THREADS + bool "Add the mount parameter 'threads=' for squashfs" + depends on SQUASHFS + depends on SQUASHFS_DECOMP_MULTI + default n + help + Use threads= to set the decompression parallel mode and the number of threads. 
+ If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y + threads=<single|multi|percpu|1|2|3|...> + else + threads=<2|3|...> + The upper limit is num_online_cpus() * 2. + config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index eb25432bd149..416c53eedbd1 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -144,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, * If there is no available decomp and already full, * let's wait for releasing decomp from other users. */ - if (stream->avail_decomp >= MAX_DECOMPRESSOR) + if (stream->avail_decomp >= msblk->max_thread_num) goto wait; /* Let's allocate new decomp */ @@ -160,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, } stream->avail_decomp++; - WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + WARN_ON(stream->avail_decomp > msblk->max_thread_num); mutex_unlock(&stream->mutex); break; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index f1e5dad8ae0a..659082e9e51d 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -67,5 +67,6 @@ struct squashfs_sb_info { unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; + int max_thread_num; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index aac3ea72a9ba..1e428ca9414e 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -53,6 +53,7 @@ enum squashfs_param { struct squashfs_mount_opts { enum Opt_errors errors; const struct squashfs_decompressor_thread_ops *thread_ops; + int thread_num; }; static const struct constant_table squashfs_param_errors[] = { @@ -67,7 +68,8 @@ static const struct fs_parameter_spec squashfs_fs_parameters[] = { {} }; -static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) + +static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (strcmp(str, "single") == 0) { @@ -86,6 +88,42 @@ static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_o return -EINVAL; } +static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + int ret; + unsigned long num; + + ret = kstrtoul(str, 0, &num); + if (ret != 0) + return -EINVAL; + if (num > 1) { + opts->thread_ops = &squashfs_decompressor_multi; + if (num > opts->thread_ops->max_decompressors()) + return -EINVAL; + opts->thread_num = (int)num; + return 0; + } +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + if (num == 1) { + opts->thread_ops = &squashfs_decompressor_single; + opts->thread_num = 1; + return 0; + } +#endif +#endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ + return -EINVAL; +} + +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ + int ret = squashfs_parse_param_threads_str(str, opts); + + if (ret == 0) + return ret; + return squashfs_parse_param_threads_num(str, opts); +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -194,6 +232,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) goto failed_mount; } msblk->thread_ops = opts->thread_ops; + if (opts->thread_num == 0) { + msblk->max_thread_num = msblk->thread_ops->max_decompressors(); + } else {
msblk->max_thread_num = opts->thread_num; + } /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( @@ -279,7 +322,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - msblk->thread_ops->max_decompressors(), msblk->block_size); + msblk->max_thread_num, msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -467,14 +510,13 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",threads=single"); return 0; } - if (msblk->thread_ops == &squashfs_decompressor_multi) { - seq_puts(s, ",threads=multi"); - return 0; - } if (msblk->thread_ops == &squashfs_decompressor_percpu) { seq_puts(s, ",threads=percpu"); return 0; } +#endif +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + seq_printf(s, ",threads=%d", msblk->max_thread_num); #endif return 0; } @@ -496,6 +538,7 @@ static int squashfs_init_fs_context(struct fs_context *fc) #else #error "fail: unknown squashfs decompression thread mode?" #endif + opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; -- cgit From 13b6269dd022aaa69ca8d1df374ab327504121cf Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Tue, 1 Nov 2022 19:15:33 +0800 Subject: ocfs2: fix memory leak in ocfs2_stack_glue_init() ocfs2_table_header should be freed in ocfs2_stack_glue_init() if ocfs2_sysfs_init() fails, otherwise kmemleak will report a memleak. BUG: memory leak unreferenced object 0xffff88810eeb5800 (size 128): comm "modprobe", pid 4507, jiffies 4296182506 (age 55.888s) hex dump (first 32 bytes): c0 40 14 a0 ff ff ff ff 00 00 00 00 01 00 00 00 .@.............. 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000001e59e1cd>] __register_sysctl_table+0xca/0xef0 [<00000000c04f70f7>] 0xffffffffa0050037 [<000000001bd12912>] do_one_initcall+0xdb/0x480 [<0000000064f766c9>] do_init_module+0x1cf/0x680 [<000000002ba52db0>] load_module+0x6441/0x6f20 [<000000009772580d>] __do_sys_finit_module+0x12f/0x1c0 [<00000000380c1f22>] do_syscall_64+0x3f/0x90 [<000000004cf473bc>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://lkml.kernel.org/r/41651ca1-432a-db34-eb97-d35744559de1@linux.alibaba.com Fixes: 3878f110f71a ("ocfs2: Move the hb_ctl_path sysctl into the stack glue.") Signed-off-by: Shang XiaoJing Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stackglue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 317126261523..a8d5ca98fa57 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header; static int __init ocfs2_stack_glue_init(void) { + int ret; + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); @@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void) return -ENOMEM; /* or something.
*/ } - return ocfs2_sysfs_init(); + ret = ocfs2_sysfs_init(); + if (ret) + unregister_sysctl_table(ocfs2_table_header); + + return ret; } static void __exit ocfs2_stack_glue_exit(void) -- cgit From c7e8d3279c984e41165a7b510759bd1771ac3941 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 1 Nov 2022 15:33:43 +0800 Subject: squashfs: fix null-ptr-deref in squashfs_fill_super When squashfs_read_table() returns an error or `sb->s_magic != SQUASHFS_MAGIC`, enters the error branch and calls msblk->thread_ops->destroy(msblk) to destroy msblk. However, msblk->thread_ops has not been initialized. Therefore, the following problem is triggered: ================================================================== BUG: KASAN: null-ptr-deref in squashfs_fill_super+0xe7a/0x13b0 Read of size 8 at addr 0000000000000008 by task swapper/0/1 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc3-next-20221031 #367 Call Trace: dump_stack_lvl+0x73/0x9f print_report+0x743/0x759 kasan_report+0xc0/0x120 __asan_load8+0xd3/0x140 squashfs_fill_super+0xe7a/0x13b0 get_tree_bdev+0x27b/0x450 squashfs_get_tree+0x19/0x30 vfs_get_tree+0x49/0x150 path_mount+0xaae/0x1350 init_mount+0xad/0x100 do_mount_root+0xbc/0x1d0 mount_block_root+0x173/0x316 mount_root+0x223/0x236 prepare_namespace+0x1eb/0x237 kernel_init_freeable+0x528/0x576 kernel_init+0x29/0x250 ret_from_fork+0x1f/0x30 ================================================================== To solve this issue, msblk->thread_ops is initialized immediately after msblk is assigned a value. Link: https://lkml.kernel.org/r/20221101073343.3961562-1-libaokun1@huawei.com Fixes: b0645770d3c7 ("squashfs: add the mount parameter theads=") Signed-off-by: Baokun Li Reviewed-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Yu Kuai Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 1e428ca9414e..7d5265a39d20 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -197,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; } msblk = sb->s_fs_info; + msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); @@ -231,7 +232,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_bdev); goto failed_mount; } - msblk->thread_ops = opts->thread_ops; + if (opts->thread_num == 0) { msblk->max_thread_num = msblk->thread_ops->max_decompressors(); } else { -- cgit From f6fbd8cbf3ed1915a7b957f2801f7c306a686c08 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 9 Nov 2022 14:14:35 -0500 Subject: lsm,fs: fix vfs_getxattr_alloc() return type and caller error paths The vfs_getxattr_alloc() function currently returns a ssize_t value despite the fact that it only uses int values internally for return values. Fix this by converting vfs_getxattr_alloc() to return an int type and adjust the callers as necessary. As part of these caller modifications, some of the callers are fixed to properly free the xattr value buffer on both success and failure to ensure that memory is not leaked in the failure case. 
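A minimal sketch of the calling convention this implies, based on the signature visible in the diff above (consume_xattr() and the xattr name are hypothetical; mnt_userns and dentry come from the caller's context):

```
char *value = NULL;
int rv;

rv = vfs_getxattr_alloc(mnt_userns, dentry, "user.example", &value, 0,
			GFP_KERNEL);
if (rv >= 0) {
	/* success: rv is the attribute length, the data sits in value */
	consume_xattr(value, rv);
}
/* free unconditionally: a buffer may have been allocated even on error */
kfree(value);
```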
Reviewed-by: Serge Hallyn Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- fs/xattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..f38fd969b5fc 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -354,11 +354,12 @@ out_noalloc: * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, - * before retrieving the extended attribute. + * before retrieving the extended attribute. The xattr value buffer should + * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. */ -ssize_t +int vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) -- cgit From 406c706c7b7f1730aa787e914817b8d16b1e99f6 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Thu, 3 Nov 2022 17:02:10 +0000 Subject: vfs: vfs_tmpfile: ensure O_EXCL flag is enforced If O_EXCL is *not* specified, then linkat() can be used to link the temporary file into the filesystem. If O_EXCL is specified then linkat() should fail (-1). After commit 863f144f12ad ("vfs: open inside ->tmpfile()") the O_EXCL flag is no longer honored by the vfs layer for tmpfile, which means the file can be linked even if O_EXCL flag is specified, which is a change in behaviour for userspace! The open flags were previously passed as a parameter, so they were unaffected by the changes to file->f_flags caused by finish_open(). This patch fixes the issue by storing file->f_flags in a local variable so the O_EXCL test logic is restored. This regression was detected by Android CTS Bionic fcntl() tests running on android-mainline [1]. [1] https://android.googlesource.com/platform/bionic/+/ refs/heads/master/tests/fcntl_test.cpp#352 Fixes: 863f144f12ad ("vfs: open inside ->tmpfile()") Acked-by: Miklos Szeredi Tested-by: Will McVicker Signed-off-by: Peter Griffin Signed-off-by: Al Viro --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 578c2110df02..9155ecb547ce 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3591,6 +3591,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; + int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); @@ -3613,7 +3614,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, if (error) return error; inode = file_inode(file); - if (!(file->f_flags & O_EXCL)) { + if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); -- cgit From f391d6ee002ea022c62dc0b09d0578f3ccce81be Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Nov 2022 14:48:00 +0300 Subject: cifs: Use after free in debug code This debug code dereferences "old_iface" after it was already freed by the call to release_iface(). Re-order the debugging to avoid this issue.
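In sketch form, the rule the re-order follows (mirroring the hunk below): consume a refcounted object before dropping what may be its last reference.

```
/* buggy order: kref_put() may free old_iface, the debug print then
 * dereferences freed memory
 */
kref_put(&old_iface->refcount, release_iface);
cifs_dbg(FYI, "releasing ref to iface: %pIS\n", &old_iface->sockaddr);

/* fixed order: log first, drop the (possibly last) reference afterwards */
cifs_dbg(FYI, "releasing ref to iface: %pIS\n", &old_iface->sockaddr);
kref_put(&old_iface->refcount, release_iface);
```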
Fixes: b54034a73baf ("cifs: during reconnect, update interface if necessary") Cc: stable@vger.kernel.org # 5.19+ Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/sess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 92e4278ec35d..9e7d9f0baa18 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -302,14 +302,14 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) /* now drop the ref to the current iface */ if (old_iface && iface) { - kref_put(&old_iface->refcount, release_iface); cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", &old_iface->sockaddr, &iface->sockaddr); - } else if (old_iface) { kref_put(&old_iface->refcount, release_iface); + } else if (old_iface) { cifs_dbg(FYI, "releasing ref to iface: %pIS\n", &old_iface->sockaddr); + kref_put(&old_iface->refcount, release_iface); } else { WARN_ON(!iface); cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); -- cgit From bc943f4872a722c5cc64d1cf41daaaf4ec63158e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Nov 2022 19:08:23 +0100 Subject: ext2: Don't flush page immediately for DIRSYNC directories We do not need to write out modified directory blocks immediately when modifying them while the page is locked. It is enough to do the flush somewhat later, which has the added benefit that inode times can be flushed as well. It also allows us to stop depending on the write_one_page() function. Signed-off-by: Jan Kara --- fs/ext2/dir.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 27873733ed8c..6fa714dbee84 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -81,11 +81,10 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); @@ -94,16 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) i_size_write(dir, pos+len); mark_inode_dirty(dir); } - - if (IS_DIRSYNC(dir)) { - err = write_one_page(page); - if (!err) - err = sync_inode_metadata(dir, 1); - } else { - unlock_page(page); - } - - return err; + unlock_page(page); } static bool ext2_check_page(struct page *page, int quiet, char *kaddr) @@ -460,6 +450,17 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ext2_get_block); } + +static int ext2_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); + return err; +} + void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, struct page *page, void *page_addr, struct inode *inode, int update_times) @@ -474,11 +475,12 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, BUG_ON(err); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); - err = ext2_commit_chunk(page, pos, len); + ext2_commit_chunk(page, pos, len); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + ext2_handle_dirsync(dir); } /* @@ -566,10
+568,11 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); - err = ext2_commit_chunk(page, pos, rec_len); + ext2_commit_chunk(page, pos, rec_len); dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + err = ext2_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: ext2_put_page(page, page_addr); @@ -615,10 +618,11 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page, if (pde) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; - err = ext2_commit_chunk(page, pos, to - from); + ext2_commit_chunk(page, pos, to - from); inode->i_ctime = inode->i_mtime = current_time(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); + err = ext2_handle_dirsync(inode); out: return err; } @@ -658,7 +662,8 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); kunmap_atomic(kaddr); - err = ext2_commit_chunk(page, 0, chunk_size); + ext2_commit_chunk(page, 0, chunk_size); + err = ext2_handle_dirsync(inode); fail: put_page(page); return err; -- cgit From a27c442d61cea70f38d9340528225b234888885b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 17:28:55 +0100 Subject: ext2: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/ext2/inode.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 918ab2f9e4c0..3b2e3e1e0fa2 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -869,11 +869,6 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -static int ext2_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, ext2_get_block, wbc); -} - static int ext2_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, ext2_get_block); @@ -948,7 +943,6 @@ const struct address_space_operations ext2_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = ext2_read_folio, .readahead = ext2_readahead, - .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, .bmap = ext2_bmap, -- cgit From 36273e5b4e3a934c6d346c8f0b16b97e018094af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 17:29:02 +0100 Subject: udf: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation in udf.
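For a buffer_head based filesystem, the write-back entry point that remains after such a conversion is ->writepages; a minimal sketch under assumed names (example_writepages and example_get_block are hypothetical):

```
#include <linux/mpage.h>

/* hypothetical filesystem: data writeback goes through ->writepages only */
static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	/* batched writeback; replaces the per-page ->writepage entry point */
	return mpage_writepages(mapping, wbc, example_get_block);
}
```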
Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/udf/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index dce6ae9ae306..0246b1b86fb9 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -182,11 +182,6 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } -static int udf_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, udf_get_block, wbc); -} - static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -239,12 +234,12 @@ const struct address_space_operations udf_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = udf_read_folio, .readahead = udf_readahead, - .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = generic_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From c51f0e6a1254b3ac2d308e1c6fd8fb936992b455 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 10:39:44 +0100 Subject: btrfs: zoned: fix missing endianness conversion in sb_write_pointer generation is an on-disk __le64 value, so use btrfs_super_generation to convert it to host endian before comparing it. Fixes: 12659251ca5d ("btrfs: implement log-structured superblock for ZONED mode") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1912abf6d020..44d8177eeffc 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -134,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, super[i] = page_address(page[i]); } - if (super[0]->generation > super[1]->generation) + if (btrfs_super_generation(super[0]) > + btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; -- cgit From a11452a3709e217492798cf3686ac2cc8eb3fb51 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 15 Nov 2022 16:29:44 +0000 Subject: btrfs: send: avoid unaligned encoded writes when attempting to clone range When trying to see if we can clone a file range, there are cases where we end up sending two write operations in case the inode from the source root has an i_size that is not sector size aligned and the length from the current offset to its i_size is less than the remaining length we are trying to clone. Issuing two write operations when we could instead issue a single write operation is not incorrect. However, it is not optimal, especially if the extents are compressed and the flag BTRFS_SEND_FLAG_COMPRESSED was passed to the send ioctl. In that case we can end up sending an encoded write with an offset that is not sector size aligned, which makes the receiver fall back to decompressing the data and writing it using regular buffered IO (so re-compressing the data in case the fs is mounted with compression enabled), because encoded writes fail with -EINVAL when an offset is not sector size aligned.
The following example, which triggered a bug in the receiver code for the fallback logic of decompressing + regular buffer IO and is fixed by the patchset referred to in a Link at the bottom of this changelog, is an example where we have the non-optimal behaviour due to an unaligned encoded write: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV > /dev/null mount -o compress $DEV $MNT # File foo has a size of 33K, not aligned to the sector size. xfs_io -f -c "pwrite -S 0xab 0 33K" $MNT/foo xfs_io -f -c "pwrite -S 0xcd 0 64K" $MNT/bar # Now clone the first 32K of file bar into foo at offset 0. xfs_io -c "reflink $MNT/bar 0 0 32K" $MNT/foo # Snapshot the default subvolume and create a full send stream (v2). btrfs subvolume snapshot -r $MNT $MNT/snap btrfs send --compressed-data -f /tmp/test.send $MNT/snap echo -e "\nFile bar in the original filesystem:" od -A d -t x1 $MNT/snap/bar umount $MNT mkfs.btrfs -f $DEV > /dev/null mount $DEV $MNT echo -e "\nReceiving stream in a new filesystem..." btrfs receive -f /tmp/test.send $MNT echo -e "\nFile bar in the new filesystem:" od -A d -t x1 $MNT/snap/bar umount $MNT Before this patch, the send stream included one regular write and one encoded write for file 'bar', with the latter not being sector size aligned and causing the receiver to fall back to decompression + buffered writes. The output of the btrfs receive command in verbose mode (-vvv): (...) mkfile o258-7-0 rename o258-7-0 -> bar utimes clone bar - source=foo source offset=0 offset=0 length=32768 write bar - offset=32768 length=1024 encoded_write bar - offset=33792, len=4096, unencoded_offset=33792, unencoded_file_len=31744, unencoded_len=65536, compression=1, encryption=0 encoded_write bar - falling back to decompress and write due to errno 22 ("Invalid argument") (...) This patch avoids the regular write followed by an unaligned encoded write so that we end up sending a single encoded write that is aligned. So after this patch the stream content is (output of btrfs receive -vvv): (...) mkfile o258-7-0 rename o258-7-0 -> bar utimes clone bar - source=foo source offset=0 offset=0 length=32768 encoded_write bar - offset=32768, len=4096, unencoded_offset=32768, unencoded_file_len=32768, unencoded_len=65536, compression=1, encryption=0 (...) So we get more optimal behaviour and avoid the silent data loss bug in versions of btrfs-progs affected by the bug referred to by the Link tag below (btrfs-progs v5.19, v5.19.1, v6.0 and v6.0.1).
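(Worked out for the 4096-byte sector size used in the example above: 32768 = 8 * 4096 is sector size aligned, while 33792 = 32768 + 1024 is not, which is why the first stream's encoded write forced the receiver to fall back.)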
Link: https://lore.kernel.org/linux-btrfs/cover.1668529099.git.fdmanana@suse.com/ Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 145c84b44fd0..1c4b693ee4a3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5702,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, u64 ext_len; u64 clone_len; u64 clone_data_offset; + bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5759,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (key.offset >= clone_src_i_size) break; - if (key.offset + ext_len > clone_src_i_size) + if (key.offset + ext_len > clone_src_i_size) { ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } clone_data_offset = btrfs_file_extent_offset(leaf, ei); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { @@ -5821,6 +5824,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, clone_len, clone_root); } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. That will make the + * receiver fallback to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due decompression and + * possible re-compression at the receiver. + */ + break; } else { ret = send_extent_data(sctx, dst_path, offset, clone_len); -- cgit From f7e942b5bb35d8e3af54053d19a6bf04143a3955 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 16 Nov 2022 22:23:54 +0800 Subject: btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit() Syzkaller reported BUG as follows: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:274 Call Trace: dump_stack_lvl+0xcd/0x134 __might_resched.cold+0x222/0x26b kmem_cache_alloc+0x2e7/0x3c0 update_qgroup_limit_item+0xe1/0x390 btrfs_qgroup_inherit+0x147b/0x1ee0 create_subvol+0x4eb/0x1710 btrfs_mksubvol+0xfe5/0x13f0 __btrfs_ioctl_snap_create+0x2b0/0x430 btrfs_ioctl_snap_create_v2+0x25a/0x520 btrfs_ioctl+0x2a1c/0x5ce0 __x64_sys_ioctl+0x193/0x200 do_syscall_64+0x35/0x80 Fix this by calling qgroup_dirty() on @dstgroup, and updating the limit item in btrfs_run_qgroups() later, outside of the spinlock context.
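A minimal sketch of the pattern being fixed (kernel-style pseudocode built only from the functions named in this message, with the lock name simplified; not the exact code):

    spin_lock(&fs_info->qgroup_lock);
    /* before: update_qgroup_limit_item() could allocate memory with
     * GFP_KERNEL and therefore sleep while this spinlock is held */
    qgroup_dirty(fs_info, dstgroup);     /* after: only mark it dirty */
    spin_unlock(&fs_info->qgroup_lock);

    /* the limit item is then written later by btrfs_run_qgroups(),
     * outside of the spinlock context */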
CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Qu Wenruo Signed-off-by: ChenXiaoSong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9334c3157c22..b74105a10f16 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2951,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, dstgroup); - if (ret) { - qgroup_mark_inconsistent(fs_info); - btrfs_info(fs_info, - "unable to update quota limit for %llu", - dstgroup->qgroupid); - goto unlock; - } + qgroup_dirty(fs_info, dstgroup); } if (srcid) { -- cgit From caf1aeaffc3b09649a56769e559333ae2c4f1802 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:10:53 -0700 Subject: eventpoll: add EPOLL_URING_WAKE poll wakeup flag We can have dependencies between epoll and io_uring. Consider an epoll context, identified by the epfd file descriptor, and an io_uring file descriptor identified by iofd. If we add iofd to the epfd context, and arm a multishot poll request for epfd with iofd, then the multishot poll request will repeatedly trigger and generate events until terminated by CQ ring overflow. This isn't a desired behavior. Add EPOLL_URING_WAKE so that io_uring can pass it in as part of the poll wakeup key, and io_uring can check for that to detect a potential recursive invocation. Cc: stable@vger.kernel.org # 6.0 Signed-off-by: Jens Axboe --- fs/eventpoll.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 52954d4637b5..64659b110973 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; @@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; - wake_up_locked_poll(&ep->poll_wait, EPOLLIN); + wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else -static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { - wake_up_poll(&ep->poll_wait, EPOLLIN); + wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif @@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0); /* * We need to lock this because we could be hit by @@ -1208,7 +1210,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, epi); + ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; @@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0);
return 0; } @@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(ep, NULL); + ep_poll_safewake(ep, NULL, 0); return 0; } -- cgit From f217d7ccb9f9ae8d9525958f902c059db1c78355 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:40 -0500 Subject: fs: dlm: avoid false-positive checker warning This patch avoids the false-positive checker warning about writing 112 bytes into an 88-byte field "e->request", see: [ 54.891560] dlm: csmb1: dlm_recover_directory 23 out 2 messages [ 54.990542] ------------[ cut here ]------------ [ 54.991012] memcpy: detected field-spanning write (size 112) of single field "&e->request" at fs/dlm/requestqueue.c:47 (size 88) [ 54.992150] WARNING: CPU: 0 PID: 297 at fs/dlm/requestqueue.c:47 dlm_add_requestqueue+0x177/0x180 [ 54.993002] CPU: 0 PID: 297 Comm: kworker/u4:3 Not tainted 6.1.0-rc5-00008-ge01d50cbd6ee #248 [ 54.993878] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-1.fc36 04/01/2014 [ 54.994718] Workqueue: dlm_recv process_recv_sockets [ 54.995230] RIP: 0010:dlm_add_requestqueue+0x177/0x180 [ 54.995731] Code: e7 01 0f 85 3b ff ff ff b9 58 00 00 00 48 c7 c2 c0 41 74 82 4c 89 ee 48 c7 c7 20 42 74 82 c6 05 8b 8d 30 02 01 e8 51 07 be 00 <0f> 0b e9 12 ff ff ff 66 90 0f 1f 44 00 00 41 57 48 8d 87 10 08 00 [ 54.997483] RSP: 0018:ffffc90000b1fbe8 EFLAGS: 00010282 [ 54.997990] RAX: 0000000000000000 RBX: ffff888024fc3d00 RCX: 0000000000000000 [ 54.998667] RDX: 0000000000000001 RSI: ffffffff81155014 RDI: fffff52000163f73 [ 54.999342] RBP: ffff88800dbac000 R08: 0000000000000001 R09: ffffc90000b1fa5f [ 54.999997] R10: fffff52000163f4b R11: 203a7970636d656d R12: ffff88800cfb0018 [ 55.000673] R13: 0000000000000070 R14: ffff888024fc3d18 R15: 0000000000000000 [ 55.001344] FS: 0000000000000000(0000) GS:ffff88806d600000(0000) knlGS:0000000000000000 [ 55.002078] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 55.002603] CR2: 00007f35d4f0b9a0 CR3: 0000000025495002 CR4: 0000000000770ef0 [ 55.003258] PKRU: 55555554 [ 55.003514] Call Trace: [ 55.003756] [ 55.003953] dlm_receive_buffer+0x1c0/0x200 [ 55.004348] dlm_process_incoming_buffer+0x46d/0x780 [ 55.004786] ? kernel_recvmsg+0x8b/0xc0 [ 55.005150] receive_from_sock.isra.0+0x168/0x420 [ 55.005582] ? process_listen_recv_socket+0x10/0x10 [ 55.006018] ? finish_task_switch.isra.0+0xe0/0x400 [ 55.006469] ? __switch_to+0x2fe/0x6a0 [ 55.006808] ? read_word_at_a_time+0xe/0x20 [ 55.007197] ? strscpy+0x146/0x190 [ 55.007505] process_one_work+0x3d0/0x6b0 [ 55.007863] worker_thread+0x8d/0x620 [ 55.008209] ? __kthread_parkme+0xd8/0xf0 [ 55.008565] ? process_one_work+0x6b0/0x6b0 [ 55.008937] kthread+0x171/0x1a0 [ 55.009251] ? kthread_exit+0x60/0x60 [ 55.009582] ret_from_fork+0x1f/0x30 [ 55.009903] [ 55.010120] ---[ end trace 0000000000000000 ]--- [ 55.025783] dlm: csmb1: dlm_recover 5 generation 3 done: 201 ms [ 55.026466] gfs2: fsid=smbcluster:csmb1.0: recover generation 3 done It seems the checker is unable to detect the additional length bytes that were allocated for the flexible array in struct dlm_message. To solve it, we split the memcpy() into one copy for the 88-byte struct and another memcpy() for the flexible array m_extra field.
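A self-contained sketch of the copy split, using simplified stand-in types (the real struct dlm_message is different; only the flexible-array idea carries over):

    #include <string.h>

    struct msg {
        char fixed[88];   /* stands in for the 88-byte fixed part */
        char m_extra[];   /* flexible array member, runtime length */
    };

    static void copy_msg(struct msg *dst, const struct msg *src, size_t extra)
    {
        /* one memcpy() per field keeps the fortified checker happy:
         * sizeof(*src) covers only the fixed part (the flexible array
         * contributes nothing to sizeof), the tail is copied separately */
        memcpy(dst, src, sizeof(*src));
        memcpy(dst->m_extra, src->m_extra, extra);
    }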
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/requestqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 036a9a0078f6..8be2893ad15b 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -44,7 +44,8 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; e->nodeid = nodeid; - memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length)); + memcpy(&e->request, ms, sizeof(*ms)); + memcpy(&e->request.m_extra, ms->m_extra, length); atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); -- cgit From 9267c85769e62c10961451fd28e88de996fdf401 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:41 -0500 Subject: fs: dlm: drop lkb ref in bug case This patch drops the lkb reference in a very unlikely case which should not happen in practice. However, if it does happen, we clean up the reference just in case. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 078bbbd43a53..982d093b80cd 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -186,7 +186,7 @@ void dlm_callback_work(struct work_struct *work) spin_unlock(&lkb->lkb_cb_lock); if (WARN_ON(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) - return; + goto out; for (;;) { castfn = lkb->lkb_astfn; @@ -217,6 +217,7 @@ void dlm_callback_work(struct work_struct *work) spin_unlock(&lkb->lkb_cb_lock); } +out: /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ dlm_put_lkb(lkb); } -- cgit From 740bb8fc10d226d64c7da2271cf0b25dab1538dc Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:42 -0500 Subject: fs: dlm: ast do WARN_ON_ONCE() on hotpath This patch changes the ast hotpath to use WARN_ON_ONCE() instead of WARN_ON() in very unlikely cases, so that we do not spam the console output if we run into states that would occur over and over again.
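For context, a hedged sketch of the difference this relies on (dequeue_callback() is a hypothetical stand-in): WARN_ON() prints a full backtrace every time its condition is true, while WARN_ON_ONCE() latches after the first report, so a recurring bad state on the hotpath logs only once:

    for (;;) {
        rv = dequeue_callback(lkb, &cb);
        if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY))
            break;   /* backtrace printed only on the first occurrence */
        /* ... deliver the callback ... */
    }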
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 6 +++--- fs/dlm/user.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 982d093b80cd..7dd072b8ea38 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -161,12 +161,12 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, spin_unlock(&ls->ls_cb_lock); break; case DLM_ENQUEUE_CALLBACK_FAILURE: - WARN_ON(1); + WARN_ON_ONCE(1); break; case DLM_ENQUEUE_CALLBACK_SUCCESS: break; default: - WARN_ON(1); + WARN_ON_ONCE(1); break; } spin_unlock(&lkb->lkb_cb_lock); @@ -185,7 +185,7 @@ void dlm_callback_work(struct work_struct *work) rv = dlm_dequeue_lkb_callback(lkb, &cb); spin_unlock(&lkb->lkb_cb_lock); - if (WARN_ON(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) + if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) goto out; for (;;) { diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 6b530db4bc0b..edf722d6935a 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -214,7 +214,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, switch (rv) { case DLM_ENQUEUE_CALLBACK_FAILURE: spin_unlock(&proc->asts_spin); - WARN_ON(1); + WARN_ON_ONCE(1); goto out; case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); @@ -224,7 +224,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, case DLM_ENQUEUE_CALLBACK_SUCCESS: break; default: - WARN_ON(1); + WARN_ON_ONCE(1); break; } spin_unlock(&proc->asts_spin); @@ -880,7 +880,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); - WARN_ON(1); + WARN_ON_ONCE(1); goto try_another; case DLM_DEQUEUE_CALLBACK_LAST: list_del_init(&lkb->lkb_cb_list); @@ -890,7 +890,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, spin_unlock(&proc->asts_spin); break; case DLM_DEQUEUE_CALLBACK_SUCCESS: break; default: - WARN_ON(1); + WARN_ON_ONCE(1); break; } spin_unlock(&proc->asts_spin); -- cgit From 554d849616769339b9dce833a4830251fc4b91ba Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:43 -0500 Subject: fs: dlm: rename DLM_IFL_NEED_SCHED to DLM_IFL_CB_PENDING This patch renames DLM_IFL_NEED_SCHED to DLM_IFL_CB_PENDING because CB_PENDING describes this flag more accurately. The flag is set when callback enqueue returns DLM_ENQUEUE_CALLBACK_NEED_SCHED because the callback worker needs to be queued. It tells us that callbacks are currently pending to be called, and it is cleared when the callback work for the specific lkb is done. The need to schedule the worker covers only part of that time, so the more accurate description is that some callbacks are pending to be called.
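Condensed from the enqueue side of the diff below, the flag protocol is (sketch only):

    spin_lock(&lkb->lkb_cb_lock);
    if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) {
        lkb->lkb_flags |= DLM_IFL_CB_PENDING;   /* first pending callback */
        rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;   /* caller queues the worker */
    }
    list_add_tail(&cb->list, &lkb->lkb_callbacks);
    spin_unlock(&lkb->lkb_cb_lock);
    /* the callback worker clears DLM_IFL_CB_PENDING once the queue drains */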
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 9 ++++----- fs/dlm/dlm_internal.h | 2 +- fs/dlm/user.c | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 7dd072b8ea38..26fef9945cc9 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -45,8 +45,7 @@ void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) kref_put(&cb->ref, dlm_release_callback); } - /* TODO */ - lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; /* invalidate */ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); @@ -104,8 +103,8 @@ int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, cb->sb_status = status; cb->sb_flags = (sbflags & 0x000000FF); kref_init(&cb->ref); - if (!(lkb->lkb_flags & DLM_IFL_NEED_SCHED)) { - lkb->lkb_flags |= DLM_IFL_NEED_SCHED; + if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) { + lkb->lkb_flags |= DLM_IFL_CB_PENDING; rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; } list_add_tail(&cb->list, &lkb->lkb_callbacks); @@ -210,7 +209,7 @@ void dlm_callback_work(struct work_struct *work) spin_lock(&lkb->lkb_cb_lock); rv = dlm_dequeue_lkb_callback(lkb, &cb); if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { - lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; spin_unlock(&lkb->lkb_cb_lock); break; } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 6318e0f51bc9..ab1a55337a6e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -211,7 +211,7 @@ struct dlm_args { #endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ -#define DLM_IFL_NEED_SCHED 0x04000000 +#define DLM_IFL_CB_PENDING 0x04000000 /* least significant 2 bytes are message changed, they are full transmitted * but at receive side only the 2 bytes LSB will be set. * diff --git a/fs/dlm/user.c b/fs/dlm/user.c index edf722d6935a..35129505ddda 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -884,8 +884,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, goto try_another; case DLM_DEQUEUE_CALLBACK_LAST: list_del_init(&lkb->lkb_cb_list); - /* TODO */ - lkb->lkb_flags &= ~DLM_IFL_NEED_SCHED; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; break; case DLM_DEQUEUE_CALLBACK_SUCCESS: break; -- cgit From 17827754e503d6c72b05a1c4603469ec9bf35d48 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:45 -0500 Subject: fs: dlm: add dst nodeid for msg tracing In DLM, when we send a dlm message it is easy to add the lock resource name, but an additional lookup is required to trace the receive message side. The idea here is to move that lookup work to the user, who can match the right send message with its receive message. Note that DLM can't drop any message; this is guaranteed by a special session layer. To do the lookup, a 3-tuple is required as a unique identification: destination nodeid, source nodeid and sequence number. This patch adds the destination nodeid to the dlm message trace points. The source nodeid is given by the h_nodeid field inside the header.
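A sketch of the resulting correlation key (illustrative struct, not the actual tracepoint layout):

    #include <stdint.h>

    struct dlm_msg_key {
        uint32_t dst_nodeid;   /* added to the trace points by this patch */
        uint32_t src_nodeid;   /* already available via the header h_nodeid */
        uint32_t seq;          /* session layer sequence number */
    };
    /* a send event and a recv event describe the same message exactly when
     * their keys match; since the session layer never drops a message,
     * every send key eventually shows up on the receive side */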
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 32194a750fe1..960def5ab530 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -479,10 +479,10 @@ static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) { switch (p->header.h_cmd) { case DLM_MSG: - trace_dlm_recv_message(seq, &p->message); + trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message); break; case DLM_RCOM: - trace_dlm_recv_rcom(seq, &p->rcom); + trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom); break; default: break; @@ -1151,11 +1151,13 @@ static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, { switch (mh->inner_p->header.h_cmd) { case DLM_MSG: - trace_dlm_send_message(mh->seq, &mh->inner_p->message, + trace_dlm_send_message(mh->node->nodeid, mh->seq, + &mh->inner_p->message, name, namelen); break; case DLM_RCOM: - trace_dlm_send_rcom(mh->seq, &mh->inner_p->rcom); + trace_dlm_send_rcom(mh->node->nodeid, mh->seq, + &mh->inner_p->rcom); break; default: /* nothing to trace */ -- cgit From 8b0188b0d60b6f6183b48380bac49fe080c5ded9 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:46 -0500 Subject: fs: dlm: add midcomms init/start functions This patch introduces the missing init, start, stop and exit functionality. The dlm application layer should always call the midcomms layer, which becomes aware of such events and redirects them to the lowcomms layer. Some functionality that is currently handled inside the start functions of midcomms and lowcomms should be handled in the init functions, as it only needs to be initialized once when dlm is loaded.
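The intended layering after this patch, summarized from the diff below (lifecycle sketch; midcomms always forwards to lowcomms):

    module load:       init_dlm()              -> dlm_midcomms_init()
    first lockspace:   __dlm_new_lockspace()   -> dlm_midcomms_start()
    last lockspace:    dlm_release_lockspace() -> dlm_midcomms_stop()
    module unload:     exit_dlm()              -> dlm_midcomms_exit()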
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 5 ++--- fs/dlm/lowcomms.c | 16 ++++++++++------ fs/dlm/lowcomms.h | 1 + fs/dlm/main.c | 7 +++++-- fs/dlm/midcomms.c | 17 ++++++++++++++++- fs/dlm/midcomms.h | 3 +++ 6 files changed, 37 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 96fe8f7ca09c..d0b4e2181a5f 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -17,7 +17,6 @@ #include "recoverd.h" #include "dir.h" #include "midcomms.h" -#include "lowcomms.h" #include "config.h" #include "memory.h" #include "lock.h" @@ -723,7 +722,7 @@ static int __dlm_new_lockspace(const char *name, const char *cluster, if (!ls_count) { dlm_scand_stop(); dlm_midcomms_shutdown(); - dlm_lowcomms_stop(); + dlm_midcomms_stop(); } out: mutex_unlock(&ls_lock); @@ -926,7 +925,7 @@ int dlm_release_lockspace(void *lockspace, int force) if (!error) ls_count--; if (!ls_count) - dlm_lowcomms_stop(); + dlm_midcomms_stop(); mutex_unlock(&ls_lock); return error; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b05c6d9b5102..0dcb69cc456a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1987,10 +1987,6 @@ static const struct dlm_proto_ops dlm_sctp_ops = { int dlm_lowcomms_start(void) { int error = -EINVAL; - int i; - - for (i = 0; i < CONN_HASH_SIZE; i++) - INIT_HLIST_HEAD(&connection_hash[i]); init_local(); if (!dlm_local_count) { @@ -1999,8 +1995,6 @@ int dlm_lowcomms_start(void) goto fail; } - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); - error = work_start(); if (error) goto fail_local; @@ -2039,6 +2033,16 @@ fail: return error; } +void dlm_lowcomms_init(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); +} + void dlm_lowcomms_exit(void) { struct dlm_node_addr *na, *safe; diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 29369feea991..bbce7a18416d 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -35,6 +35,7 @@ extern int dlm_allow_conn; int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); +void dlm_lowcomms_init(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 1c5be4b70ac1..a77338be3237 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -17,7 +17,7 @@ #include "user.h" #include "memory.h" #include "config.h" -#include "lowcomms.h" +#include "midcomms.h" #define CREATE_TRACE_POINTS #include @@ -30,6 +30,8 @@ static int __init init_dlm(void) if (error) goto out; + dlm_midcomms_init(); + error = dlm_lockspace_init(); if (error) goto out_mem; @@ -66,6 +68,7 @@ static int __init init_dlm(void) out_lockspace: dlm_lockspace_exit(); out_mem: + dlm_midcomms_exit(); dlm_memory_exit(); out: return error; @@ -79,7 +82,7 @@ static void __exit exit_dlm(void) dlm_config_exit(); dlm_memory_exit(); dlm_lockspace_exit(); - dlm_lowcomms_exit(); + dlm_midcomms_exit(); dlm_unregister_debugfs(); } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 960def5ab530..6b4c72fb0171 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1205,13 +1205,28 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, #endif int dlm_midcomms_start(void) +{ + return dlm_lowcomms_start(); +} + +void dlm_midcomms_stop(void) +{ + dlm_lowcomms_stop(); +} + +void dlm_midcomms_init(void) { int i; for (i = 0; i < 
CONN_HASH_SIZE; i++) INIT_HLIST_HEAD(&node_hash[i]); - return dlm_lowcomms_start(); + dlm_lowcomms_init(); +} + +void dlm_midcomms_exit(void) +{ + dlm_lowcomms_exit(); } static void dlm_act_fin_ack_rcv(struct midcomms_node *node) diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index d6286e80b077..69296552d5ad 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -21,6 +21,9 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); +void dlm_midcomms_stop(void); +void dlm_midcomms_init(void); +void dlm_midcomms_exit(void); void dlm_midcomms_shutdown(void); void dlm_midcomms_add_member(int nodeid); void dlm_midcomms_remove_member(int nodeid); -- cgit From 01ea3d7701cb52dece379384f8aa7b8840f1d7c7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:47 -0500 Subject: fs: dlm: remove twice INIT_WORK This patch removes a duplicated INIT_WORK() call. We already do this inside dlm_lowcomms_init(), which is called only once when dlm is loaded. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0dcb69cc456a..3106e7f87344 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1825,7 +1825,6 @@ static int dlm_listen_for_all(void) save_listen_callbacks(sock); add_listen_sock(sock, &listen_con); - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); result = sock->ops->listen(sock, 5); if (result < 0) { dlm_close_sock(&listen_con.sock); -- cgit From dd070a56e0fa36f03bcd09fbf1521c733cf2aa21 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:48 -0500 Subject: fs: dlm: use list_first_entry_or_null Instead of checking list_empty() we can do the same with list_first_entry_or_null() and return NULL if the returned entry is NULL. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3106e7f87344..d3302b10b37e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -214,15 +214,12 @@ static struct writequeue_entry *con_next_wq(struct connection *con) { struct writequeue_entry *e; - if (list_empty(&con->writequeue)) - return NULL; - - e = list_first_entry(&con->writequeue, struct writequeue_entry, - list); + e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry, + list); /* if len is zero nothing is to send, if there are users filling * buffers we wait until the users are done so we can send more. */ - if (e->users || e->len == 0) + if (!e || e->users || e->len == 0) return NULL; return e; -- cgit From 1037c2a94ab51997d8b1ef9e7f6ed697e6e17d84 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:49 -0500 Subject: fs: dlm: use listen sock as dlm running indicator This patch switches the check of whether dlm lowcomms is running from the dlm_allow_conn flag to whether we actually have a listen socket set. The listen socket is set and unset in the lowcomms start and shutdown functions. To synchronize with the data_ready() callback, the socket callback is swapped while the socket lock is held.
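A condensed sketch of the synchronization described above, drawn from this and the following patches (not new code): the listen socket's data_ready callback is installed and restored under the socket lock, so a concurrent data_ready cannot race with setup or teardown:

    /* start: save the original callback and install the dlm one */
    lock_sock(sock->sk);
    listen_sock.sk_data_ready = sock->sk->sk_data_ready;
    sock->sk->sk_data_ready = lowcomms_listen_data_ready;
    release_sock(sock->sk);

    /* shutdown: restore the saved callback before closing the socket */
    lock_sock(sock->sk);
    sock->sk->sk_data_ready = listen_sock.sk_data_ready;
    release_sock(sock->sk);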
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 4 ++-- fs/dlm/lowcomms.c | 17 ++++++----------- fs/dlm/lowcomms.h | 4 ++-- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/dlm/config.c b/fs/dlm/config.c index ac8b62106ce0..20b60709eccf 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -183,7 +183,7 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) return -EINVAL; } - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; @@ -194,7 +194,7 @@ static int dlm_check_zero_and_dlm_running(unsigned int x) if (!x) return -EINVAL; - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d3302b10b37e..d9001d40aa6e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -176,7 +176,6 @@ static DEFINE_SPINLOCK(dlm_node_addrs_spin); static struct listen_connection listen_con; static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; -int dlm_allow_conn; /* Work queues */ static struct workqueue_struct *recv_workqueue; @@ -191,6 +190,11 @@ static const struct dlm_proto_ops *dlm_proto_ops; static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +bool dlm_lowcomms_is_running(void) +{ + return !!listen_con.sock; +} + static void writequeue_entry_ctor(void *data) { struct writequeue_entry *entry = data; @@ -511,9 +515,6 @@ static void lowcomms_data_ready(struct sock *sk) static void lowcomms_listen_data_ready(struct sock *sk) { - if (!dlm_allow_conn) - return; - queue_work(recv_workqueue, &listen_con.rwork); } @@ -1689,10 +1690,7 @@ void dlm_lowcomms_shutdown(void) { int idx; - /* Set all the flags to prevent any - * socket activity. - */ - dlm_allow_conn = 0; + restore_callbacks(listen_con.sock); if (recv_workqueue) flush_workqueue(recv_workqueue); @@ -1995,8 +1993,6 @@ int dlm_lowcomms_start(void) if (error) goto fail_local; - dlm_allow_conn = 1; - /* Start listening */ switch (dlm_config.ci_protocol) { case DLM_PROTO_TCP: @@ -2021,7 +2017,6 @@ int dlm_lowcomms_start(void) fail_listen: dlm_proto_ops = NULL; fail_proto_ops: - dlm_allow_conn = 0; work_stop(); fail_local: deinit_local(); diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index bbce7a18416d..acaf1b0b3885 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -29,8 +29,8 @@ static inline int nodeid_hash(int nodeid) return nodeid & (CONN_HASH_SIZE-1); } -/* switch to check if dlm is running */ -extern int dlm_allow_conn; +/* check if dlm is running */ +bool dlm_lowcomms_is_running(void); int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); -- cgit From 4f567acb0b8622fe265aac4b0037a03ef544ba24 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:50 -0500 Subject: fs: dlm: remove socket shutdown handling Since commit 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") we have functionality at the dlm application protocol layer like TCP offers for half-closed sockets. This feature is required because cluster manager events about leaving resource memberships can already have occurred locally while other cluster nodes still have a pending leave membership happening over the cluster manager protocol.
During this time the local dlm node has already shut down its connection and does not transmit any new dlm messages, but it still needs to be able to accept dlm messages because of the pending leave membership request in the cluster manager protocol, which the dlm kernel implementation has no control over. We have this functionality at the application layer for two reasons. The main reason is that SCTP does not support such functionality at the socket layer, but we can do it inside the application layer. Another small issue is that this feature is broken in the TCP world, because some NAT devices do not implement it correctly. This is the same reason why the reliable connection session layer in DLM exists: we give up on middle devices in the network which send e.g. TCP resets out. In DLM we cannot have any message dropping, and we ensure over a session layer that it can't happen. Back to the half-closed graceful shutdown handling: it is no longer necessary to do it at the socket layer (which is only supported for TCP sockets) because we do it at the application layer. This patch removes this handling; if there are still issues, then we have a problem in the application layer handling. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 127 ++++++++++-------------------------------------------- fs/dlm/lowcomms.h | 1 + fs/dlm/midcomms.c | 6 ++- 3 files changed, 27 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d9001d40aa6e..e33bee1beba2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -65,7 +65,6 @@ /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 -#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) struct connection { struct socket *sock; /* NULL if not connected */ @@ -79,14 +78,11 @@ struct connection { #define CF_CLOSE 6 #define CF_APP_LIMITED 7 #define CF_CLOSING 8 -#define CF_SHUTDOWN 9 -#define CF_CONNECTED 10 -#define CF_RECONNECT 11 -#define CF_DELAY_CONNECT 12 -#define CF_EOF 13 +#define CF_CONNECTED 9 +#define CF_RECONNECT 10 +#define CF_DELAY_CONNECT 11 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; - atomic_t writequeue_cnt; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -94,7 +90,6 @@ struct connection { struct connection *sendcon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ unsigned char *rx_buf; int rx_buflen; int rx_leftover; @@ -157,10 +152,6 @@ struct dlm_proto_ops { int (*listen_validate)(void); void (*listen_sockopts)(struct socket *sock); int (*listen_bind)(struct socket *sock); - /* What to do to shutdown */ - void (*shutdown_action)(struct connection *con); - /* What to do to eof check */ - bool (*eof_condition)(struct connection *con); }; static struct listen_sock_callbacks { @@ -241,11 +232,6 @@ static struct connection *__find_con(int nodeid, int r) return NULL; } -static bool tcp_eof_condition(struct connection *con) -{ - return atomic_read(&con->writequeue_cnt); -} - static int dlm_con_init(struct connection *con, int nodeid) { con->rx_buflen = dlm_config.ci_buffer_size; @@ -257,10 +243,8 @@ static int dlm_con_init(struct connection *con, int nodeid) mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); - atomic_set(&con->writequeue_cnt, 0); INIT_WORK(&con->swork,
process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); - init_waitqueue_head(&con->shutdown_wait); return 0; } @@ -771,7 +755,6 @@ static void free_entry(struct writequeue_entry *e) } list_del(&e->list); - atomic_dec(&e->con->writequeue_cnt); kref_put(&e->ref, dlm_page_release); } @@ -834,56 +817,10 @@ static void close_connection(struct connection *con, bool and_other, clear_bit(CF_CONNECTED, &con->flags); clear_bit(CF_DELAY_CONNECT, &con->flags); clear_bit(CF_RECONNECT, &con->flags); - clear_bit(CF_EOF, &con->flags); mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); } -static void shutdown_connection(struct connection *con) -{ - int ret; - - flush_work(&con->swork); - - mutex_lock(&con->sock_mutex); - /* nothing to shutdown */ - if (!con->sock) { - mutex_unlock(&con->sock_mutex); - return; - } - - set_bit(CF_SHUTDOWN, &con->flags); - ret = kernel_sock_shutdown(con->sock, SHUT_WR); - mutex_unlock(&con->sock_mutex); - if (ret) { - log_print("Connection %p failed to shutdown: %d will force close", - con, ret); - goto force_close; - } else { - ret = wait_event_timeout(con->shutdown_wait, - !test_bit(CF_SHUTDOWN, &con->flags), - DLM_SHUTDOWN_WAIT_TIMEOUT); - if (ret == 0) { - log_print("Connection %p shutdown timed out, will force close", - con); - goto force_close; - } - } - - return; - -force_close: - clear_bit(CF_SHUTDOWN, &con->flags); - close_connection(con, false, true, true); -} - -static void dlm_tcp_shutdown(struct connection *con) -{ - if (con->othercon) - shutdown_connection(con->othercon); - shutdown_connection(con); -} - static int con_realloc_receive_buf(struct connection *con, int newlen) { unsigned char *newbuf; @@ -975,19 +912,8 @@ out_close: log_print("connection %p got EOF from %d", con, con->nodeid); - if (dlm_proto_ops->eof_condition && - dlm_proto_ops->eof_condition(con)) { - set_bit(CF_EOF, &con->flags); - mutex_unlock(&con->sock_mutex); - } else { - mutex_unlock(&con->sock_mutex); - close_connection(con, false, true, false); - - /* handling for tcp shutdown */ - clear_bit(CF_SHUTDOWN, &con->flags); - wake_up(&con->shutdown_wait); - } - + mutex_unlock(&con->sock_mutex); + close_connection(con, false, true, false); /* signal to breaking receive worker */ ret = -1; } else { @@ -1261,7 +1187,6 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; - atomic_inc(&con->writequeue_cnt); if (cb) cb(data); @@ -1467,20 +1392,6 @@ again: } spin_unlock(&con->writequeue_lock); - /* close if we got EOF */ - if (test_and_clear_bit(CF_EOF, &con->flags)) { - mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - - /* handling for tcp shutdown */ - clear_bit(CF_SHUTDOWN, &con->flags); - wake_up(&con->shutdown_wait); - } else { - mutex_unlock(&con->sock_mutex); - } - - return; - out: mutex_unlock(&con->sock_mutex); return; @@ -1680,16 +1591,8 @@ static int work_start(void) return 0; } -static void shutdown_conn(struct connection *con) -{ - if (dlm_proto_ops->shutdown_action) - dlm_proto_ops->shutdown_action(con); -} - void dlm_lowcomms_shutdown(void) { - int idx; - restore_callbacks(listen_con.sock); if (recv_workqueue) @@ -1698,9 +1601,25 @@ void dlm_lowcomms_shutdown(void) flush_workqueue(send_workqueue); dlm_close_sock(&listen_con.sock); +} + +void dlm_lowcomms_shutdown_node(int nodeid, bool force) +{ + struct connection *con; + int idx; idx = srcu_read_lock(&connections_srcu); - foreach_conn(shutdown_conn); + con = nodeid2con(nodeid, 0); + 
if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return; + } + + WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + close_connection(con, true, true, true); srcu_read_unlock(&connections_srcu, idx); } @@ -1912,8 +1831,6 @@ static const struct dlm_proto_ops dlm_tcp_ops = { .listen_validate = dlm_tcp_listen_validate, .listen_sockopts = dlm_tcp_listen_sockopts, .listen_bind = dlm_tcp_listen_bind, - .shutdown_action = dlm_tcp_shutdown, - .eof_condition = tcp_eof_condition, }; static int dlm_sctp_bind(struct socket *sock) diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index acaf1b0b3885..3e8dca66183b 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -34,6 +34,7 @@ bool dlm_lowcomms_is_running(void); int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); +void dlm_lowcomms_shutdown_node(int nodeid, bool force); void dlm_lowcomms_stop(void); void dlm_lowcomms_init(void); void dlm_lowcomms_exit(void); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 6b4c72fb0171..b0e8bdcaab1b 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1426,11 +1426,13 @@ static void midcomms_shutdown(struct midcomms_node *node) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); midcomms_node_reset(node); + dlm_lowcomms_shutdown_node(node->nodeid, true); return; } pr_debug("active shutdown done for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); + dlm_lowcomms_shutdown_node(node->nodeid, false); } void dlm_midcomms_shutdown(void) @@ -1438,6 +1440,8 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; + dlm_lowcomms_shutdown(); + mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1455,8 +1459,6 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); - - dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) -- cgit From c3d88dfd15835f42c79467eee2c40db3095e5271 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:51 -0500 Subject: fs: dlm: cleanup listen sock handling This patch removes save_listen_callbacks() and add_listen_sock() as they are each used only once in the lowcomms functionality. For lowcomms shutdown it is not necessary to flush the whole workqueues to synchronize with restoring the old sk_data_ready() callback; only the listen connection's receive work needs to be cancelled. For each individual node shutdown we should make sure the last ack has been transmitted, which is done by flushing the per-connection swork worker. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 51 +++++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e33bee1beba2..8bf09c3a0946 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -647,17 +647,6 @@ out: orig_report(sk); } -/* Note: sk_callback_lock must be locked before calling this function.
*/ -static void save_listen_callbacks(struct socket *sock) -{ - struct sock *sk = sock->sk; - - listen_sock.sk_data_ready = sk->sk_data_ready; - listen_sock.sk_state_change = sk->sk_state_change; - listen_sock.sk_write_space = sk->sk_write_space; - listen_sock.sk_error_report = sk->sk_error_report; -} - static void restore_callbacks(struct socket *sock) { struct sock *sk = sock->sk; @@ -671,21 +660,6 @@ static void restore_callbacks(struct socket *sock) release_sock(sk); } -static void add_listen_sock(struct socket *sock, struct listen_connection *con) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - save_listen_callbacks(sock); - con->sock = sock; - - sk->sk_user_data = con; - sk->sk_allocation = GFP_NOFS; - /* Install a data_ready callback */ - sk->sk_data_ready = lowcomms_listen_data_ready; - release_sock(sk); -} - /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { @@ -1593,13 +1567,12 @@ static int work_start(void) void dlm_lowcomms_shutdown(void) { - restore_callbacks(listen_con.sock); - - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); + /* stop lowcomms_listen_data_ready calls */ + lock_sock(listen_con.sock->sk); + listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; + release_sock(listen_con.sock->sk); + cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); } @@ -1615,6 +1588,7 @@ void dlm_lowcomms_shutdown_node(int nodeid, bool force) return; } + flush_work(&con->swork); WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); clean_one_writequeue(con); if (con->othercon) @@ -1736,8 +1710,17 @@ static int dlm_listen_for_all(void) if (result < 0) goto out; - save_listen_callbacks(sock); - add_listen_sock(sock, &listen_con); + lock_sock(sock->sk); + listen_sock.sk_data_ready = sock->sk->sk_data_ready; + listen_sock.sk_write_space = sock->sk->sk_write_space; + listen_sock.sk_error_report = sock->sk->sk_error_report; + listen_sock.sk_state_change = sock->sk->sk_state_change; + + listen_con.sock = sock; + + sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_data_ready = lowcomms_listen_data_ready; + release_sock(sock->sk); result = sock->ops->listen(sock, 5); if (result < 0) { -- cgit From c51c9cd8addcfbdc097dbefd59f022402183644b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:52 -0500 Subject: fs: dlm: don't put dlm_local_addrs on heap This patch stops allocating the dlm_local_addr[] entries on the heap. Instead we directly store "struct sockaddr_storage" values in the array. This removes the function deinit_local(), which only freed that memory.
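The shape of the change, side by side (sketch):

    /* before: an array of heap pointers, each kmemdup()ed in init_local()
     * and kfree()d again in deinit_local() */
    static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];

    /* after: storage embedded by value; filling a slot is a memcpy() and
     * there is nothing left to free */
    static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];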
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 8bf09c3a0946..0883394cfbeb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -165,7 +165,7 @@ static LIST_HEAD(dlm_node_addrs); static DEFINE_SPINLOCK(dlm_node_addrs_spin); static struct listen_connection listen_con; -static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; /* Work queues */ @@ -383,7 +383,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, if (!sa_out) return 0; - if (dlm_local_addr[0]->ss_family == AF_INET) { + if (dlm_local_addr[0].ss_family == AF_INET) { struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; @@ -683,7 +683,7 @@ static void add_sock(struct socket *sock, struct connection *con) static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, int *addr_len) { - saddr->ss_family = dlm_local_addr[0]->ss_family; + saddr->ss_family = dlm_local_addr[0].ss_family; if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; in4_addr->sin_port = cpu_to_be16(port); @@ -1065,7 +1065,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { - memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr)); make_sockaddr(&localaddr, port, &addr_len); if (!i) @@ -1085,7 +1085,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) /* Get local addresses */ static void init_local(void) { - struct sockaddr_storage sas, *addr; + struct sockaddr_storage sas; int i; dlm_local_count = 0; @@ -1093,21 +1093,10 @@ static void init_local(void) if (dlm_our_addr(&sas, i)) break; - addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS); - if (!addr) - break; - dlm_local_addr[dlm_local_count++] = addr; + memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas)); } } -static void deinit_local(void) -{ - int i; - - for (i = 0; i < dlm_local_count; i++) - kfree(dlm_local_addr[i]); -} - static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; @@ -1463,7 +1452,7 @@ static void dlm_connect(struct connection *con) } /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) goto socket_err; @@ -1679,7 +1668,6 @@ void dlm_lowcomms_stop(void) foreach_conn(free_conn); srcu_read_unlock(&connections_srcu, idx); work_stop(); - deinit_local(); dlm_proto_ops = NULL; } @@ -1696,7 +1684,7 @@ static int dlm_listen_for_all(void) if (result < 0) return result; - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { log_print("Can't create comms socket: %d", result); @@ -1743,7 +1731,7 @@ static int dlm_tcp_bind(struct socket *sock) /* Bind to our cluster-known address connecting to avoid * routing problems. 
*/ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, @@ -1800,8 +1788,8 @@ static int dlm_tcp_listen_bind(struct socket *sock) int addr_len; /* Bind to our port */ - make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0], + make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0], addr_len); } @@ -1891,7 +1879,7 @@ int dlm_lowcomms_start(void) error = work_start(); if (error) - goto fail_local; + goto fail; /* Start listening */ switch (dlm_config.ci_protocol) { @@ -1918,8 +1906,6 @@ fail_listen: dlm_proto_ops = NULL; fail_proto_ops: work_stop(); -fail_local: - deinit_local(); fail: return error; } -- cgit From 6f0b0b5d7ae70423c94915989c45b29e87c61ad7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:53 -0500 Subject: fs: dlm: remove dlm_node_addrs lookup list This patch merges the dlm_node_addrs lookup list into the connection structure. It is a per-node mapping to configuration set up by configfs, and we don't need two lookup structures for it. The connection hash now has the same lifetime the dlm_node_addrs entries had: we add new entries only when the cluster is configured, not while new connections are coming in; we remove a connection when a node gets fenced; and we clean up all connections when dlm exits. It should work the same, and may even surface more issues, because we no longer try to somehow keep those two data structures in sync with the current cluster configuration. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 290 +++++++++++++++++++++++++----------------------------- 1 file changed, 136 insertions(+), 154 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0883394cfbeb..ed3cd3757199 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -93,6 +93,11 @@ struct connection { unsigned char *rx_buf; int rx_buflen; int rx_leftover; + int mark; + int addr_count; + int curr_addr_index; + struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT]; + spinlock_t addrs_lock; struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -131,15 +136,6 @@ struct dlm_msg { struct kref ref; }; -struct dlm_node_addr { - struct list_head list; - int nodeid; - int mark; - int addr_count; - int curr_addr_index; - struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; -}; - struct dlm_proto_ops { bool try_new_addr; const char *name; @@ -161,9 +157,6 @@ static struct listen_sock_callbacks { void (*sk_write_space)(struct sock *); } listen_sock; -static LIST_HEAD(dlm_node_addrs); -static DEFINE_SPINLOCK(dlm_node_addrs_spin); - static struct listen_connection listen_con; static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; @@ -306,17 +299,6 @@ static void foreach_conn(void (*conn_func)(struct connection *c)) } } -static struct dlm_node_addr *find_node_addr(int nodeid) -{ - struct dlm_node_addr *na; - - list_for_each_entry(na, &dlm_node_addrs, list) { - if (na->nodeid == nodeid) - return na; - } - return NULL; -} - static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { @@ -350,38 +332,45 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, unsigned int *mark) { struct sockaddr_storage
sas; - struct dlm_node_addr *na; + struct connection *con; + int idx; if (!dlm_local_count) return -1; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na && na->addr_count) { - memcpy(&sas, na->addr[na->curr_addr_index], - sizeof(struct sockaddr_storage)); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } - if (try_new_addr) { - na->curr_addr_index++; - if (na->curr_addr_index == na->addr_count) - na->curr_addr_index = 0; - } + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; } - spin_unlock(&dlm_node_addrs_spin); - if (!na) - return -EEXIST; + memcpy(&sas, &con->addr[con->curr_addr_index], + sizeof(struct sockaddr_storage)); - if (!na->addr_count) - return -ENOENT; + if (try_new_addr) { + con->curr_addr_index++; + if (con->curr_addr_index == con->addr_count) + con->curr_addr_index = 0; + } - *mark = na->mark; + *mark = con->mark; + spin_unlock(&con->addrs_lock); if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); - if (!sa_out) + if (!sa_out) { + srcu_read_unlock(&connections_srcu, idx); return 0; + } if (dlm_local_addr[0].ss_family == AF_INET) { struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; @@ -393,43 +382,46 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, ret6->sin6_addr = in6->sin6_addr; } + srcu_read_unlock(&connections_srcu, idx); return 0; } static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, unsigned int *mark) { - struct dlm_node_addr *na; - int rv = -EEXIST; - int addr_i; - - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry(na, &dlm_node_addrs, list) { - if (!na->addr_count) - continue; - - for (addr_i = 0; addr_i < na->addr_count; addr_i++) { - if (addr_compare(na->addr[addr_i], addr)) { - *nodeid = na->nodeid; - *mark = na->mark; - rv = 0; - goto unlock; + struct connection *con; + int i, idx, addr_i; + + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + WARN_ON_ONCE(!con->addr_count); + + spin_lock(&con->addrs_lock); + for (addr_i = 0; addr_i < con->addr_count; addr_i++) { + if (addr_compare(&con->addr[addr_i], addr)) { + *nodeid = con->nodeid; + *mark = con->mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return 0; + } } + spin_unlock(&con->addrs_lock); } } -unlock: - spin_unlock(&dlm_node_addrs_spin); - return rv; + srcu_read_unlock(&connections_srcu, idx); + + return -ENOENT; } -/* caller need to held dlm_node_addrs_spin lock */ -static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, - const struct sockaddr_storage *addr) +static bool dlm_lowcomms_con_has_addr(const struct connection *con, + const struct sockaddr_storage *addr) { int i; - for (i = 0; i < na->addr_count; i++) { - if (addr_compare(na->addr[i], addr)) + for (i = 0; i < con->addr_count; i++) { + if (addr_compare(&con->addr[i], addr)) return true; } @@ -438,52 +430,42 @@ static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) { - struct sockaddr_storage *new_addr; - struct dlm_node_addr *new_node, *na; - bool ret; - - new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS); - if (!new_node) - return -ENOMEM; + struct connection *con; + bool ret, idx; - new_addr = 
kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS); - if (!new_addr) { - kfree(new_node); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, GFP_NOFS); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; } - memcpy(new_addr, addr, len); - - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (!na) { - new_node->nodeid = nodeid; - new_node->addr[0] = new_addr; - new_node->addr_count = 1; - new_node->mark = dlm_config.ci_mark; - list_add(&new_node->list, &dlm_node_addrs); - spin_unlock(&dlm_node_addrs_spin); + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + memcpy(&con->addr[0], addr, sizeof(*addr)); + con->addr_count = 1; + con->mark = dlm_config.ci_mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return 0; } - ret = dlm_lowcomms_na_has_addr(na, addr); + ret = dlm_lowcomms_con_has_addr(con, addr); if (ret) { - spin_unlock(&dlm_node_addrs_spin); - kfree(new_addr); - kfree(new_node); + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return -EEXIST; } - if (na->addr_count >= DLM_MAX_ADDR_COUNT) { - spin_unlock(&dlm_node_addrs_spin); - kfree(new_addr); - kfree(new_node); + if (con->addr_count >= DLM_MAX_ADDR_COUNT) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return -ENOSPC; } - na->addr[na->addr_count++] = new_addr; - spin_unlock(&dlm_node_addrs_spin); - kfree(new_node); + memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr)); + srcu_read_unlock(&connections_srcu, idx); + spin_unlock(&con->addrs_lock); return 0; } @@ -558,10 +540,10 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, GFP_NOFS); - if (!con) { + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); - return -ENOMEM; + return -ENOENT; } lowcomms_connect_sock(con); @@ -572,18 +554,20 @@ int dlm_lowcomms_connect_node(int nodeid) int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) { - struct dlm_node_addr *na; + struct connection *con; + int idx; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (!na) { - spin_unlock(&dlm_node_addrs_spin); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } - na->mark = mark; - spin_unlock(&dlm_node_addrs_spin); - + spin_lock(&con->addrs_lock); + con->mark = mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return 0; } @@ -960,10 +944,10 @@ static int accept_from_sock(struct listen_connection *con) * In this case we store the incoming one in "othercon" */ idx = srcu_read_lock(&connections_srcu); - newcon = nodeid2con(nodeid, GFP_NOFS); - if (!newcon) { + newcon = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!newcon)) { srcu_read_unlock(&connections_srcu, idx); - result = -ENOMEM; + result = -ENOENT; goto accept_err; } @@ -1210,8 +1194,8 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, } idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, allocation); - if (!con) { + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); return NULL; } @@ -1376,35 +1360,44 @@ static void clean_one_writequeue(struct connection *con) spin_unlock(&con->writequeue_lock); } +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); 
+ + kfree(con->rx_buf); + kfree(con); +} + /* Called from recovery when it knows that a node has left the cluster */ int dlm_lowcomms_close(int nodeid) { struct connection *con; - struct dlm_node_addr *na; int idx; log_print("closing connection to node %d", nodeid); + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); - if (con) { - set_bit(CF_CLOSE, &con->flags); - close_connection(con, true, true, true); - clean_one_writequeue(con); - if (con->othercon) - clean_one_writequeue(con->othercon); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; } - srcu_read_unlock(&connections_srcu, idx); - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + close_connection(con, true, true, true); + + clean_one_writequeue(con); + call_srcu(&connections_srcu, &con->rcu, connection_release); + if (con->othercon) { + clean_one_writequeue(con->othercon); + if (con->othercon) + call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); } - spin_unlock(&dlm_node_addrs_spin); + srcu_read_unlock(&connections_srcu, idx); return 0; } @@ -1607,27 +1600,9 @@ static void stop_conn(struct connection *con) _stop_conn(con, true); } -static void connection_release(struct rcu_head *rcu) -{ - struct connection *con = container_of(rcu, struct connection, rcu); - - kfree(con->rx_buf); - kfree(con); -} - static void free_conn(struct connection *con) { close_connection(con, true, true, true); - spin_lock(&connections_lock); - hlist_del_rcu(&con->list); - spin_unlock(&connections_lock); - if (con->othercon) { - clean_one_writequeue(con->othercon); - call_srcu(&connections_srcu, &con->othercon->rcu, - connection_release); - } - clean_one_writequeue(con); - call_srcu(&connections_srcu, &con->rcu, connection_release); } static void work_flush(void) @@ -1922,14 +1897,21 @@ void dlm_lowcomms_init(void) void dlm_lowcomms_exit(void) { - struct dlm_node_addr *na, *safe; + struct connection *con; + int i, idx; - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + if (con->othercon) + call_srcu(&connections_srcu, &con->othercon->rcu, + connection_release); + call_srcu(&connections_srcu, &con->rcu, connection_release); + } } - spin_unlock(&dlm_node_addrs_spin); + srcu_read_unlock(&connections_srcu, idx); } -- cgit From e9dd5fd849f1ac125919a79eb4be66f078dd7f51 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:54 -0500 Subject: fs: dlm: use sock2con without checking null This patch removes the null checks on the sockets' private data. If we hit a null dereference there, it indicates a bug in our implementation: such a callback must never occur in this state.
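For illustration, a minimal sketch of the invariant this commit relies on (a sketch under stated assumptions, not the kernel source): add_sock() installs the socket callbacks only after pointing sk_user_data at the connection, and restore_callbacks() removes the callbacks and clears the pointer together, both under lock_sock(), so a running callback can assume sock2con() never returns NULL:

/* Illustrative sketch only: recover the connection from the socket's
 * private data. Because callback installation and teardown happen
 * under lock_sock() together with sk_user_data updates, a callback
 * that is executing always sees a valid pointer.
 */
static struct connection *sock2con(struct sock *sk)
{
	return sk->sk_user_data;
}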
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index ed3cd3757199..677e31144ca0 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -472,10 +472,9 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { - struct connection *con; + struct connection *con = sock2con(sk); - con = sock2con(sk); - if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); } @@ -486,11 +485,7 @@ static void lowcomms_listen_data_ready(struct sock *sk) static void lowcomms_write_space(struct sock *sk) { - struct connection *con; - - con = sock2con(sk); - if (!con) - return; + struct connection *con = sock2con(sk); if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { log_print("connected to node %d", con->nodeid); @@ -573,14 +568,10 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) static void lowcomms_error_report(struct sock *sk) { - struct connection *con; + struct connection *con = sock2con(sk); void (*orig_report)(struct sock *) = NULL; struct inet_sock *inet; - con = sock2con(sk); - if (con == NULL) - goto out; - orig_report = listen_sock.sk_error_report; inet = inet_sk(sk); -- cgit From c852a6d70698d5e4a9f3670ecabf9656ff00c73c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:55 -0500 Subject: fs: dlm: use saved sk_error_report() This patch changes how the original sk_error_report() is called: instead of saving it to a stack variable and invoking it later, we call the saved listen_sock.sk_error_report() directly. If listen_sock.sk_error_report() is NULL at this point, it indicates a bug in our implementation. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 677e31144ca0..643f9810ec35 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -569,11 +569,8 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) static void lowcomms_error_report(struct sock *sk) { struct connection *con = sock2con(sk); - void (*orig_report)(struct sock *) = NULL; struct inet_sock *inet; - orig_report = listen_sock.sk_error_report; - inet = inet_sk(sk); switch (sk->sk_family) { case AF_INET: @@ -618,8 +615,7 @@ static void lowcomms_error_report(struct sock *sk) queue_work(send_workqueue, &con->swork); out: - if (orig_report) - orig_report(sk); + listen_sock.sk_error_report(sk); } static void restore_callbacks(struct socket *sock) -- cgit From 1351975ac1377225cef5d858971e17252c06ff51 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:56 -0500 Subject: fs: dlm: don't init error value This patch removes an unnecessary initialization of an error value to -EINVAL.
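This is the classic dead-store cleanup; a hedged illustration of the shape (hypothetical helper names, not the dlm code): the initializer is dead because the variable is assigned on every path before it is read.

int do_setup(void);	/* hypothetical helpers, for illustration only */
int do_run(void);

int example_start(void)
{
	int error;	/* previously 'int error = -EINVAL;' -- a dead store */

	error = do_setup();	/* always assigned before first use */
	if (error)
		return error;

	return do_run();
}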
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 643f9810ec35..c6b91f67a2c2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1830,7 +1830,7 @@ static const struct dlm_proto_ops dlm_sctp_ops = { int dlm_lowcomms_start(void) { - int error = -EINVAL; + int error; init_local(); if (!dlm_local_count) { -- cgit From dbb751ffab0b764720e360efd642ba6bf076d87f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 17 Nov 2022 17:11:57 -0500 Subject: fs: dlm: parallelize lowcomms socket handling This patch reworks the lowcomms handling; the main goal is to let recvmsg() and sendpage() run in parallel. Parallel in two senses: 1. per connection and 2. so that recvmsg() and sendpage() don't block each other. Currently recvmsg()/sendpage() cannot run in parallel because the two workqueues "dlm_recv" and "dlm_send" are ordered workqueues, meaning only one work item can execute at a time, while the number of queued items grows with the number of nodes in the cluster. The two workqueues for sending and receiving can also block each other when the same connection is handled at the same time in dlm_recv and dlm_send, because of the per-connection mutex around the socket handling. To make this more parallel we introduce one "dlm_io" workqueue which is not an ordered workqueue and does not limit the number of workers. Via the per-connection SEND/RECV pending flags we still schedule work ordered per connection and per send/receive task. To get rid of the mutex that blocks workers doing socket handling, we switch to a semaphore that takes socket operations as read locks and socket releases as write locks, to prevent sock_release() from being called while the socket is in use. There may be further optimization in removing the semaphore and replacing it with another synchronization mechanism, but due to other circumstances, e.g. the othercon behaviour, that change seems complicated to do now. I added comments about removing the othercon handling and moving to a different synchronization mechanism once that is possible. We need to defer that to the next dlm major version upgrade because it is not backwards compatible with the current connect mechanism. The processing of dlm messages still needs to be handled by an ordered workqueue. A "dlm_process" ordered workqueue was introduced, which is filled by the receive worker. This is probably the next bottleneck of DLM, but the application can't currently parse dlm messages in parallel. A comment was added about lifting dlm message processing into a non-sleepable softirq context to get message processing done fast. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1024 +++++++++++++++++++++++++++++------------------------ fs/dlm/midcomms.c | 45 ++- fs/dlm/midcomms.h | 1 + 3 files changed, 586 insertions(+), 484 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c6b91f67a2c2..799d1c36eabf 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,35 +63,43 @@ #define NEEDED_RMEM (4*1024*1024) -/* Number of messages to send before rescheduling */ -#define MAX_SEND_MSG_COUNT 25 - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ - struct mutex sock_mutex; + /* this semaphore is used to allow parallel recv/send in read + * lock mode.
When we release a sock we need to held the write lock. + * + * However this is locking code and not nice. When we remove the + * othercon handling we can look into other mechanism to synchronize + * io handling to call sock_release() at the right time. + */ + struct rw_semaphore sock_lock; unsigned long flags; -#define CF_READ_PENDING 1 -#define CF_WRITE_PENDING 2 -#define CF_INIT_PENDING 4 +#define CF_APP_LIMITED 0 +#define CF_RECV_PENDING 1 +#define CF_SEND_PENDING 2 +#define CF_RECV_INTR 3 +#define CF_IO_STOP 4 #define CF_IS_OTHERCON 5 -#define CF_CLOSE 6 -#define CF_APP_LIMITED 7 -#define CF_CLOSING 8 -#define CF_CONNECTED 9 -#define CF_RECONNECT 10 -#define CF_DELAY_CONNECT 11 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int retries; -#define MAX_CONNECT_RETRIES 3 struct hlist_node list; + /* due some connect()/accept() races we currently have this cross over + * connection attempt second connection for one node. + * + * There is a solution to avoid the race by introducing a connect + * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to + * connect. Otherside can connect but will only be considered that + * the other side wants to have a reconnect. + * + * However changing to this behaviour will break backwards compatible. + * In a DLM protocol major version upgrade we should remove this! + */ struct connection *othercon; - struct connection *sendcon; - struct work_struct rwork; /* Receive workqueue */ - struct work_struct swork; /* Send workqueue */ - unsigned char *rx_buf; - int rx_buflen; + struct work_struct rwork; /* receive worker */ + struct work_struct swork; /* send worker */ + unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; int addr_count; @@ -136,6 +144,14 @@ struct dlm_msg { struct kref ref; }; +struct processqueue_entry { + unsigned char *buf; + int nodeid; + int buflen; + + struct list_head list; +}; + struct dlm_proto_ops { bool try_new_addr; const char *name; @@ -162,8 +178,8 @@ static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; /* Work queues */ -static struct workqueue_struct *recv_workqueue; -static struct workqueue_struct *send_workqueue; +static struct workqueue_struct *io_workqueue; +static struct workqueue_struct *process_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(connections_lock); @@ -171,14 +187,44 @@ DEFINE_STATIC_SRCU(connections_srcu); static const struct dlm_proto_ops *dlm_proto_ops; +#define DLM_IO_SUCCESS 0 +#define DLM_IO_END 1 +#define DLM_IO_EOF 2 +#define DLM_IO_RESCHED 3 + static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void process_dlm_messages(struct work_struct *work); + +static DECLARE_WORK(process_work, process_dlm_messages); +static DEFINE_SPINLOCK(processqueue_lock); +static bool process_dlm_messages_pending; +static LIST_HEAD(processqueue); bool dlm_lowcomms_is_running(void) { return !!listen_con.sock; } +static void lowcomms_queue_swork(struct connection *con) +{ + WARN_ON_ONCE(!lockdep_is_held(&con->writequeue_lock)); + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_bit(CF_APP_LIMITED, &con->flags) && + !test_and_set_bit(CF_SEND_PENDING, &con->flags)) + queue_work(io_workqueue, &con->swork); +} + +static void lowcomms_queue_rwork(struct connection *con) +{ + WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); + + if (!test_bit(CF_IO_STOP, &con->flags) && + 
!test_and_set_bit(CF_RECV_PENDING, &con->flags)) + queue_work(io_workqueue, &con->rwork); +} + static void writequeue_entry_ctor(void *data) { struct writequeue_entry *entry = data; @@ -225,21 +271,15 @@ static struct connection *__find_con(int nodeid, int r) return NULL; } -static int dlm_con_init(struct connection *con, int nodeid) +static void dlm_con_init(struct connection *con, int nodeid) { - con->rx_buflen = dlm_config.ci_buffer_size; - con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); - if (!con->rx_buf) - return -ENOMEM; - con->nodeid = nodeid; - mutex_init(&con->sock_mutex); + init_rwsem(&con->sock_lock); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); - - return 0; + spin_lock_init(&con->addrs_lock); } /* @@ -249,7 +289,7 @@ static int dlm_con_init(struct connection *con, int nodeid) static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con, *tmp; - int r, ret; + int r; r = nodeid_hash(nodeid); con = __find_con(nodeid, r); @@ -260,11 +300,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) if (!con) return NULL; - ret = dlm_con_init(con, nodeid); - if (ret) { - kfree(con); - return NULL; - } + dlm_con_init(con, nodeid); spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can @@ -276,7 +312,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); - kfree(con->rx_buf); kfree(con); return tmp; } @@ -287,18 +322,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return con; } -/* Loop round all connections */ -static void foreach_conn(void (*conn_func)(struct connection *c)) -{ - int i; - struct connection *con; - - for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], list) - conn_func(con); - } -} - static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { @@ -474,56 +497,38 @@ static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); - if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) - queue_work(recv_workqueue, &con->rwork); -} - -static void lowcomms_listen_data_ready(struct sock *sk) -{ - queue_work(recv_workqueue, &listen_con.rwork); + set_bit(CF_RECV_INTR, &con->flags); + lowcomms_queue_rwork(con); } static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("connected to node %d", con->nodeid); - queue_work(send_workqueue, &con->swork); - return; - } - clear_bit(SOCK_NOSPACE, &con->sock->flags); + spin_lock_bh(&con->writequeue_lock); if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } - queue_work(send_workqueue, &con->swork); -} - -static inline void lowcomms_connect_sock(struct connection *con) -{ - if (test_bit(CF_CLOSE, &con->flags)) - return; - queue_work(send_workqueue, &con->swork); - cond_resched(); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); } static void lowcomms_state_change(struct sock *sk) { /* SCTP layer is not calling sk_data_ready when the connection - * is done, so we catch the signal through here. Also, it - * doesn't switch socket state when entering shutdown, so we - * skip the write in that case. 
+ * is done, so we catch the signal through here. */ - if (sk->sk_shutdown) { - if (sk->sk_shutdown == RCV_SHUTDOWN) - lowcomms_data_ready(sk); - } else if (sk->sk_state == TCP_ESTABLISHED) { - lowcomms_write_space(sk); - } + if (sk->sk_shutdown == RCV_SHUTDOWN) + lowcomms_data_ready(sk); +} + +static void lowcomms_listen_data_ready(struct sock *sk) +{ + queue_work(io_workqueue, &listen_con.rwork); } int dlm_lowcomms_connect_node(int nodeid) @@ -541,9 +546,16 @@ int dlm_lowcomms_connect_node(int nodeid) return -ENOENT; } - lowcomms_connect_sock(con); + down_read(&con->sock_lock); + if (!con->sock) { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + up_read(&con->sock_lock); srcu_read_unlock(&connections_srcu, idx); + cond_resched(); return 0; } @@ -596,39 +608,23 @@ static void lowcomms_error_report(struct sock *sk) "invalid socket family %d set, " "sk_err=%d/%d\n", dlm_our_nodeid(), sk->sk_family, sk->sk_err, sk->sk_err_soft); - goto out; - } - - /* below sendcon only handling */ - if (test_bit(CF_IS_OTHERCON, &con->flags)) - con = con->sendcon; - - switch (sk->sk_err) { - case ECONNREFUSED: - set_bit(CF_DELAY_CONNECT, &con->flags); - break; - default: break; } - if (!test_and_set_bit(CF_RECONNECT, &con->flags)) - queue_work(send_workqueue, &con->swork); + dlm_midcomms_unack_msg_resend(con->nodeid); -out: listen_sock.sk_error_report(sk); } -static void restore_callbacks(struct socket *sock) +static void restore_callbacks(struct sock *sk) { - struct sock *sk = sock->sk; + WARN_ON_ONCE(!lockdep_sock_is_held(sk)); - lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; - release_sock(sk); } /* Make a socket active */ @@ -640,10 +636,10 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock = sock; sk->sk_user_data = con; - /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; - sk->sk_state_change = lowcomms_state_change; + if (dlm_config.ci_protocol == DLM_PROTO_SCTP) + sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_error_report = lowcomms_error_report; release_sock(sk); @@ -705,37 +701,62 @@ static void free_entry(struct writequeue_entry *e) static void dlm_close_sock(struct socket **sock) { - if (*sock) { - restore_callbacks(*sock); - sock_release(*sock); - *sock = NULL; + lock_sock((*sock)->sk); + restore_callbacks((*sock)->sk); + release_sock((*sock)->sk); + + sock_release(*sock); + *sock = NULL; +} + +static void allow_connection_io(struct connection *con) +{ + if (con->othercon) + clear_bit(CF_IO_STOP, &con->othercon->flags); + clear_bit(CF_IO_STOP, &con->flags); +} + +static void stop_connection_io(struct connection *con) +{ + if (con->othercon) + stop_connection_io(con->othercon); + + down_write(&con->sock_lock); + if (con->sock) { + lock_sock(con->sock->sk); + restore_callbacks(con->sock->sk); + + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + } else { + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); } + up_write(&con->sock_lock); + + cancel_work_sync(&con->swork); + cancel_work_sync(&con->rwork); } /* Close a remote connection and tidy up */ -static 
void close_connection(struct connection *con, bool and_other, - bool tx, bool rx) +static void close_connection(struct connection *con, bool and_other) { - bool closing = test_and_set_bit(CF_CLOSING, &con->flags); struct writequeue_entry *e; - if (tx && !closing && cancel_work_sync(&con->swork)) { - log_print("canceled swork for node %d", con->nodeid); - clear_bit(CF_WRITE_PENDING, &con->flags); - } - if (rx && !closing && cancel_work_sync(&con->rwork)) { - log_print("canceled rwork for node %d", con->nodeid); - clear_bit(CF_READ_PENDING, &con->flags); + if (con->othercon && and_other) + close_connection(con->othercon, false); + + down_write(&con->sock_lock); + if (!con->sock) { + up_write(&con->sock_lock); + return; } - mutex_lock(&con->sock_mutex); dlm_close_sock(&con->sock); - if (con->othercon && and_other) { - /* Will only re-enter once. */ - close_connection(con->othercon, false, tx, rx); - } - /* if we send a writequeue entry only a half way, we drop the * whole entry because reconnection and that we not start of the * middle of a msg which will confuse the other end. @@ -747,143 +768,209 @@ static void close_connection(struct connection *con, bool and_other, * our policy is to start on a clean state when disconnects, we don't * know what's send/received on transport layer in this case. */ - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_first_entry(&con->writequeue, struct writequeue_entry, list); if (e->dirty) free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_APP_LIMITED, &con->flags); - clear_bit(CF_CONNECTED, &con->flags); - clear_bit(CF_DELAY_CONNECT, &con->flags); - clear_bit(CF_RECONNECT, &con->flags); - mutex_unlock(&con->sock_mutex); - clear_bit(CF_CLOSING, &con->flags); + clear_bit(CF_RECV_PENDING, &con->flags); + clear_bit(CF_SEND_PENDING, &con->flags); + up_write(&con->sock_lock); } -static int con_realloc_receive_buf(struct connection *con, int newlen) +static struct processqueue_entry *new_processqueue_entry(int nodeid, + int buflen) { - unsigned char *newbuf; - - newbuf = kmalloc(newlen, GFP_NOFS); - if (!newbuf) - return -ENOMEM; + struct processqueue_entry *pentry; - /* copy any leftover from last receive */ - if (con->rx_leftover) - memmove(newbuf, con->rx_buf, con->rx_leftover); + pentry = kmalloc(sizeof(*pentry), GFP_NOFS); + if (!pentry) + return NULL; - /* swap to new buffer space */ - kfree(con->rx_buf); - con->rx_buflen = newlen; - con->rx_buf = newbuf; + pentry->buf = kmalloc(buflen, GFP_NOFS); + if (!pentry->buf) { + kfree(pentry); + return NULL; + } - return 0; + pentry->nodeid = nodeid; + return pentry; } -/* Data received from remote end */ -static int receive_from_sock(struct connection *con) +static void free_processqueue_entry(struct processqueue_entry *pentry) { - struct msghdr msg; - struct kvec iov; - int ret, buflen; + kfree(pentry->buf); + kfree(pentry); +} + +struct dlm_processed_nodes { + int nodeid; - mutex_lock(&con->sock_mutex); + struct list_head list; +}; - if (con->sock == NULL) { - ret = -EAGAIN; - goto out_close; +static void add_processed_node(int nodeid, struct list_head *processed_nodes) +{ + struct dlm_processed_nodes *n; + + list_for_each_entry(n, processed_nodes, list) { + /* we already remembered this node */ + if (n->nodeid == nodeid) + return; } - /* realloc if we get new buffer size to read out */ - buflen = dlm_config.ci_buffer_size; - if (con->rx_buflen != 
buflen && con->rx_leftover <= buflen) { - ret = con_realloc_receive_buf(con, buflen); - if (ret < 0) - goto out_resched; + /* if it's fails in worst case we simple don't send an ack back. + * We try it next time. + */ + n = kmalloc(sizeof(*n), GFP_NOFS); + if (!n) + return; + + n->nodeid = nodeid; + list_add(&n->list, processed_nodes); +} + +static void process_dlm_messages(struct work_struct *work) +{ + struct dlm_processed_nodes *n, *n_tmp; + struct processqueue_entry *pentry; + LIST_HEAD(processed_nodes); + + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (WARN_ON_ONCE(!pentry)) { + spin_unlock(&processqueue_lock); + return; } + list_del(&pentry->list); + spin_unlock(&processqueue_lock); + for (;;) { - /* calculate new buffer parameter regarding last receive and - * possible leftover bytes - */ - iov.iov_base = con->rx_buf + con->rx_leftover; - iov.iov_len = con->rx_buflen - con->rx_leftover; - - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - trace_dlm_recv(con->nodeid, ret); - if (ret == -EAGAIN) + dlm_process_incoming_buffer(pentry->nodeid, pentry->buf, + pentry->buflen); + add_processed_node(pentry->nodeid, &processed_nodes); + free_processqueue_entry(pentry); + + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (!pentry) { + process_dlm_messages_pending = false; + spin_unlock(&processqueue_lock); break; - else if (ret <= 0) - goto out_close; - - /* new buflen according readed bytes and leftover from last receive */ - buflen = ret + con->rx_leftover; - ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); - if (ret < 0) - goto out_close; - - /* calculate leftover bytes from process and put it into begin of - * the receive buffer, so next receive we have the full message - * at the start address of the receive buffer. 
- */ - con->rx_leftover = buflen - ret; - if (con->rx_leftover) { - memmove(con->rx_buf, con->rx_buf + ret, - con->rx_leftover); } + + list_del(&pentry->list); + spin_unlock(&processqueue_lock); + } + + /* send ack back after we processed couple of messages */ + list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) { + list_del(&n->list); + dlm_midcomms_receive_done(n->nodeid); + kfree(n); + } +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con, int buflen) +{ + struct processqueue_entry *pentry; + int ret, buflen_real; + struct msghdr msg; + struct kvec iov; + + pentry = new_processqueue_entry(con->nodeid, buflen); + if (!pentry) + return DLM_IO_RESCHED; + + memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover); + + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes + */ + iov.iov_base = pentry->buf + con->rx_leftover; + iov.iov_len = buflen - con->rx_leftover; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + clear_bit(CF_RECV_INTR, &con->flags); +again: + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); + trace_dlm_recv(con->nodeid, ret); + if (ret == -EAGAIN) { + lock_sock(con->sock->sk); + if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) { + release_sock(con->sock->sk); + goto again; + } + + clear_bit(CF_RECV_PENDING, &con->flags); + release_sock(con->sock->sk); + free_processqueue_entry(pentry); + return DLM_IO_END; + } else if (ret == 0) { + /* close will clear CF_RECV_PENDING */ + free_processqueue_entry(pentry); + return DLM_IO_EOF; + } else if (ret < 0) { + free_processqueue_entry(pentry); + return ret; } - dlm_midcomms_receive_done(con->nodeid); - mutex_unlock(&con->sock_mutex); - return 0; + /* new buflen according readed bytes and leftover from last receive */ + buflen_real = ret + con->rx_leftover; + ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf, + buflen_real); + if (ret < 0) { + free_processqueue_entry(pentry); + return ret; + } -out_resched: - if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) - queue_work(recv_workqueue, &con->rwork); - mutex_unlock(&con->sock_mutex); - return -EAGAIN; - -out_close: - if (ret == 0) { - log_print("connection %p got EOF from %d", - con, con->nodeid); - - mutex_unlock(&con->sock_mutex); - close_connection(con, false, true, false); - /* signal to breaking receive worker */ - ret = -1; - } else { - mutex_unlock(&con->sock_mutex); + pentry->buflen = ret; + + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. 
+ */ + con->rx_leftover = buflen_real - ret; + memmove(con->rx_leftover_buf, pentry->buf + ret, + con->rx_leftover); + + spin_lock(&processqueue_lock); + list_add_tail(&pentry->list, &processqueue); + if (!process_dlm_messages_pending) { + process_dlm_messages_pending = true; + queue_work(process_workqueue, &process_work); } - return ret; + spin_unlock(&processqueue_lock); + + return DLM_IO_SUCCESS; } /* Listening socket is busy, accept a connection */ -static int accept_from_sock(struct listen_connection *con) +static int accept_from_sock(void) { - int result; struct sockaddr_storage peeraddr; - struct socket *newsock; - int len, idx; - int nodeid; + int len, idx, result, nodeid; struct connection *newcon; - struct connection *addcon; + struct socket *newsock; unsigned int mark; - if (!con->sock) - return -ENOTCONN; - - result = kernel_accept(con->sock, &newsock, O_NONBLOCK); - if (result < 0) + result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK); + if (result == -EAGAIN) + return DLM_IO_END; + else if (result < 0) goto accept_err; /* Get the connected socket's peer */ @@ -940,7 +1027,7 @@ static int accept_from_sock(struct listen_connection *con) sock_set_mark(newsock->sk, mark); - mutex_lock(&newcon->sock_mutex); + down_write(&newcon->sock_lock); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -948,63 +1035,50 @@ static int accept_from_sock(struct listen_connection *con) othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); - mutex_unlock(&newcon->sock_mutex); + up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } - result = dlm_con_init(othercon, nodeid); - if (result < 0) { - kfree(othercon); - mutex_unlock(&newcon->sock_mutex); - srcu_read_unlock(&connections_srcu, idx); - goto accept_err; - } - - lockdep_set_subclass(&othercon->sock_mutex, 1); - set_bit(CF_IS_OTHERCON, &othercon->flags); + dlm_con_init(othercon, nodeid); + lockdep_set_subclass(&othercon->sock_lock, 1); newcon->othercon = othercon; - othercon->sendcon = newcon; + set_bit(CF_IS_OTHERCON, &othercon->flags); } else { /* close other sock con if we have something new */ - close_connection(othercon, false, true, false); + close_connection(othercon, false); } - mutex_lock(&othercon->sock_mutex); + down_write(&othercon->sock_lock); add_sock(newsock, othercon); - addcon = othercon; - mutex_unlock(&othercon->sock_mutex); + + /* check if we receved something while adding */ + lock_sock(othercon->sock->sk); + lowcomms_queue_rwork(othercon); + release_sock(othercon->sock->sk); + up_write(&othercon->sock_lock); } else { /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. 
*/ add_sock(newsock, newcon); - addcon = newcon; - } - - set_bit(CF_CONNECTED, &addcon->flags); - mutex_unlock(&newcon->sock_mutex); - - /* - * Add it to the active queue in case we got data - * between processing the accept adding the socket - * to the read_sockets list - */ - if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) - queue_work(recv_workqueue, &addcon->rwork); + /* check if we receved something while adding */ + lock_sock(newcon->sock->sk); + lowcomms_queue_rwork(newcon); + release_sock(newcon->sock->sk); + } + up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); - return 0; + return DLM_IO_SUCCESS; accept_err: if (newsock) sock_release(newsock); - if (result != -EAGAIN) - log_print("error accepting connection from node: %d", result); return result; } @@ -1098,7 +1172,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, { struct writequeue_entry *e; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { @@ -1127,7 +1201,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, list_add_tail(&e->list, &con->writequeue); out: - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); return e; }; @@ -1176,7 +1250,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, len < sizeof(struct dlm_header)) { BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); - WARN_ON(1); + WARN_ON_ONCE(1); return NULL; } @@ -1207,7 +1281,7 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) struct connection *con = e->con; int users; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); kref_get(&msg->ref); list_add(&msg->list, &e->msgs); @@ -1216,13 +1290,11 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) goto out; e->len = DLM_WQ_LENGTH_BYTES(e); - spin_unlock(&con->writequeue_lock); - queue_work(send_workqueue, &con->swork); - return; + lowcomms_queue_swork(con); out: - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); return; } @@ -1244,7 +1316,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg) kref_put(&msg->ref, dlm_msg_release); } -/* does not held connections_srcu, usage workqueue only */ +/* does not held connections_srcu, usage lowcomms_error_report only */ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) { struct dlm_msg *msg_resend; @@ -1270,88 +1342,78 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) } /* Send a message */ -static void send_to_sock(struct connection *con) +static int send_to_sock(struct connection *con) { const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset, ret; - int count; - -again: - count = 0; - mutex_lock(&con->sock_mutex); - if (con->sock == NULL) - goto out_connect; - - spin_lock(&con->writequeue_lock); - for (;;) { - e = con_next_wq(con); - if (!e) - break; - - len = e->len; - offset = e->offset; - BUG_ON(len == 0 && e->users == 0); - spin_unlock(&con->writequeue_lock); - - ret = kernel_sendpage(con->sock, e->page, offset, len, - msg_flags); - trace_dlm_send(con->nodeid, ret); - if (ret == -EAGAIN || ret == 0) { - if (ret == -EAGAIN && - test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && - !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { - /* Notify TCP that we're limited by the - * application window size. 
- */ - set_bit(SOCK_NOSPACE, &con->sock->flags); - con->sock->sk->sk_write_pending++; - } - cond_resched(); - goto out; - } else if (ret < 0) - goto out; + spin_lock_bh(&con->writequeue_lock); + e = con_next_wq(con); + if (!e) { + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + return DLM_IO_END; + } - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); + len = e->len; + offset = e->offset; + WARN_ON_ONCE(len == 0 && e->users == 0); + spin_unlock_bh(&con->writequeue_lock); - /* Don't starve people filling buffers */ - if (++count >= MAX_SEND_MSG_COUNT) { - spin_unlock(&con->writequeue_lock); - mutex_unlock(&con->sock_mutex); - cond_resched(); - goto again; + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + trace_dlm_send(con->nodeid, ret); + if (ret == -EAGAIN || ret == 0) { + lock_sock(con->sock->sk); + spin_lock_bh(&con->writequeue_lock); + if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags); + con->sock->sk->sk_write_pending++; + + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + /* wait for write_space() event */ + return DLM_IO_END; } + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + return DLM_IO_RESCHED; + } else if (ret < 0) { + return ret; } - spin_unlock(&con->writequeue_lock); -out: - mutex_unlock(&con->sock_mutex); - return; + spin_lock_bh(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + spin_unlock_bh(&con->writequeue_lock); -out_connect: - mutex_unlock(&con->sock_mutex); - queue_work(send_workqueue, &con->swork); - cond_resched(); + return DLM_IO_SUCCESS; } static void clean_one_writequeue(struct connection *con) { struct writequeue_entry *e, *safe; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); } static void connection_release(struct rcu_head *rcu) { struct connection *con = container_of(rcu, struct connection, rcu); - kfree(con->rx_buf); + WARN_ON_ONCE(!list_empty(&con->writequeue)); + WARN_ON_ONCE(con->sock); kfree(con); } @@ -1371,12 +1433,14 @@ int dlm_lowcomms_close(int nodeid) return -ENOENT; } + stop_connection_io(con); + log_print("io handling for node: %d stopped", nodeid); + close_connection(con, true); + spin_lock(&connections_lock); hlist_del_rcu(&con->list); spin_unlock(&connections_lock); - close_connection(con, true, true, true); - clean_one_writequeue(con); call_srcu(&connections_srcu, &con->rcu, connection_release); if (con->othercon) { @@ -1386,148 +1450,238 @@ int dlm_lowcomms_close(int nodeid) } srcu_read_unlock(&connections_srcu, idx); + /* for debugging we print when we are done to compare with other + * messages in between. 
This function need to be correctly synchronized + * with io handling + */ + log_print("closing connection to node %d done", nodeid); + return 0; } -/* Receive workqueue function */ +/* Receive worker function */ static void process_recv_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, rwork); + int ret, buflen; - clear_bit(CF_READ_PENDING, &con->flags); - receive_from_sock(con); + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + buflen = READ_ONCE(dlm_config.ci_buffer_size); + do { + ret = receive_from_sock(con, buflen); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); + + switch (ret) { + case DLM_IO_END: + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_EOF: + close_connection(con, false); + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_RESCHED: + cond_resched(); + queue_work(io_workqueue, &con->rwork); + /* CF_RECV_PENDING not cleared */ + break; + default: + if (ret < 0) { + if (test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, false); + } else { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + + /* CF_RECV_PENDING cleared for othercon + * we trigger send queue if not already done + * and process_send_sockets will handle it + */ + break; + } + + WARN_ON_ONCE(1); + break; + } } static void process_listen_recv_socket(struct work_struct *work) { int ret; + if (WARN_ON_ONCE(!listen_con.sock)) + return; + do { - ret = accept_from_sock(&listen_con); - } while (!ret); + ret = accept_from_sock(); + } while (ret == DLM_IO_SUCCESS); + + if (ret < 0) + log_print("critical error accepting connection: %d", ret); } -static void dlm_connect(struct connection *con) +static int dlm_connect(struct connection *con) { struct sockaddr_storage addr; int result, addr_len; struct socket *sock; unsigned int mark; - /* Some odd races can cause double-connects, ignore them */ - if (con->retries++ > MAX_CONNECT_RETRIES) - return; - - if (con->sock) { - log_print("node %d already connected.", con->nodeid); - return; - } - memset(&addr, 0, sizeof(addr)); result = nodeid_to_addr(con->nodeid, &addr, NULL, dlm_proto_ops->try_new_addr, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); - return; + return result; } /* Create a socket to communicate with */ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) - goto socket_err; + return result; sock_set_mark(sock->sk, mark); dlm_proto_ops->sockopts(sock); - add_sock(sock, con); - result = dlm_proto_ops->bind(sock); - if (result < 0) - goto add_sock_err; + if (result < 0) { + sock_release(sock); + return result; + } + + add_sock(sock, con); log_print_ratelimited("connecting to %d", con->nodeid); make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr, addr_len); - if (result < 0) - goto add_sock_err; - - return; - -add_sock_err: - dlm_close_sock(&con->sock); + switch (result) { + case -EINPROGRESS: + /* not an error */ + fallthrough; + case 0: + break; + default: + if (result < 0) + dlm_close_sock(&con->sock); -socket_err: - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. 
- */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - msleep(1000); - lowcomms_connect_sock(con); + break; } + + return result; } -/* Send workqueue function */ +/* Send worker function */ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); + int ret; - WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags)); + WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags)); + + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + down_write(&con->sock_lock); + if (!con->sock) { + ret = dlm_connect(con); + switch (ret) { + case 0: + break; + case -EINPROGRESS: + /* avoid spamming resched on connection + * we might can switch to a state_change + * event based mechanism if established + */ + msleep(100); + break; + default: + /* CF_SEND_PENDING not cleared */ + up_write(&con->sock_lock); + log_print("connect to node %d try %d error %d", + con->nodeid, con->retries++, ret); + msleep(1000); + /* For now we try forever to reconnect. In + * future we should send a event to cluster + * manager to fence itself after certain amount + * of retries. + */ + queue_work(io_workqueue, &con->swork); + return; + } + } + downgrade_write(&con->sock_lock); + } - clear_bit(CF_WRITE_PENDING, &con->flags); + do { + ret = send_to_sock(con); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); - if (test_and_clear_bit(CF_RECONNECT, &con->flags)) { - close_connection(con, false, false, true); - dlm_midcomms_unack_msg_resend(con->nodeid); - } + switch (ret) { + case DLM_IO_END: + /* CF_SEND_PENDING cleared */ + break; + case DLM_IO_RESCHED: + /* CF_SEND_PENDING not cleared */ + cond_resched(); + queue_work(io_workqueue, &con->swork); + break; + default: + if (ret < 0) { + close_connection(con, false); - if (con->sock == NULL) { - if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) - msleep(1000); + /* CF_SEND_PENDING cleared */ + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + break; + } - mutex_lock(&con->sock_mutex); - dlm_connect(con); - mutex_unlock(&con->sock_mutex); + WARN_ON_ONCE(1); + break; } - - if (!list_empty(&con->writequeue)) - send_to_sock(con); } static void work_stop(void) { - if (recv_workqueue) { - destroy_workqueue(recv_workqueue); - recv_workqueue = NULL; + if (io_workqueue) { + destroy_workqueue(io_workqueue); + io_workqueue = NULL; } - if (send_workqueue) { - destroy_workqueue(send_workqueue); - send_workqueue = NULL; + if (process_workqueue) { + destroy_workqueue(process_workqueue); + process_workqueue = NULL; } } static int work_start(void) { - recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM); - if (!recv_workqueue) { - log_print("can't start dlm_recv"); + io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM, + 0); + if (!io_workqueue) { + log_print("can't start dlm_io"); return -ENOMEM; } - send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM); - if (!send_workqueue) { - log_print("can't start dlm_send"); - destroy_workqueue(recv_workqueue); - recv_workqueue = NULL; + /* ordered dlm message process queue, + * should be converted to a tasklet + */ + process_workqueue = alloc_ordered_workqueue("dlm_process", + WQ_HIGHPRI | WQ_MEM_RECLAIM); + if (!process_workqueue) { + log_print("can't start dlm_process"); + 
destroy_workqueue(io_workqueue); + io_workqueue = NULL; return -ENOMEM; } @@ -1543,6 +1697,8 @@ void dlm_lowcomms_shutdown(void) cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); + + flush_workqueue(process_workqueue); } void dlm_lowcomms_shutdown_node(int nodeid, bool force) @@ -1558,79 +1714,19 @@ void dlm_lowcomms_shutdown_node(int nodeid, bool force) } flush_work(&con->swork); + stop_connection_io(con); WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); + close_connection(con, true); clean_one_writequeue(con); if (con->othercon) clean_one_writequeue(con->othercon); - close_connection(con, true, true, true); + allow_connection_io(con); srcu_read_unlock(&connections_srcu, idx); } -static void _stop_conn(struct connection *con, bool and_other) -{ - mutex_lock(&con->sock_mutex); - set_bit(CF_CLOSE, &con->flags); - set_bit(CF_READ_PENDING, &con->flags); - set_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock && con->sock->sk) { - lock_sock(con->sock->sk); - con->sock->sk->sk_user_data = NULL; - release_sock(con->sock->sk); - } - if (con->othercon && and_other) - _stop_conn(con->othercon, false); - mutex_unlock(&con->sock_mutex); -} - -static void stop_conn(struct connection *con) -{ - _stop_conn(con, true); -} - -static void free_conn(struct connection *con) -{ - close_connection(con, true, true, true); -} - -static void work_flush(void) -{ - int ok; - int i; - struct connection *con; - - do { - ok = 1; - foreach_conn(stop_conn); - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); - for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], - list) { - ok &= test_bit(CF_READ_PENDING, &con->flags); - ok &= test_bit(CF_WRITE_PENDING, &con->flags); - if (con->othercon) { - ok &= test_bit(CF_READ_PENDING, - &con->othercon->flags); - ok &= test_bit(CF_WRITE_PENDING, - &con->othercon->flags); - } - } - } - } while (!ok); -} - void dlm_lowcomms_stop(void) { - int idx; - - idx = srcu_read_lock(&connections_srcu); - work_flush(); - foreach_conn(free_conn); - srcu_read_unlock(&connections_srcu, idx); work_stop(); - dlm_proto_ops = NULL; } @@ -1709,17 +1805,7 @@ static int dlm_tcp_bind(struct socket *sock) static int dlm_tcp_connect(struct connection *con, struct socket *sock, struct sockaddr *addr, int addr_len) { - int ret; - - ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); - switch (ret) { - case -EINPROGRESS: - fallthrough; - case 0: - return 0; - } - - return ret; + return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); } static int dlm_tcp_listen_validate(void) @@ -1784,13 +1870,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, sock_set_sndtimeo(sock->sk, 5); ret = sock->ops->connect(sock, addr, addr_len, 0); sock_set_sndtimeo(sock->sk, 0); - if (ret < 0) - return ret; - - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("connected to node %d", con->nodeid); - - return 0; + return ret; } static int dlm_sctp_listen_validate(void) diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index b0e8bdcaab1b..fc015a6abe17 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -306,11 +306,11 @@ static void dlm_send_queue_flush(struct midcomms_node *node) pr_debug("flush midcomms send queue of node %d\n", node->nodeid); rcu_read_lock(); - spin_lock(&node->send_queue_lock); + spin_lock_bh(&node->send_queue_lock); list_for_each_entry_rcu(mh, &node->send_queue, list) { dlm_mhandle_delete(node, mh); } - 
spin_unlock(&node->send_queue_lock); + spin_unlock_bh(&node->send_queue_lock); rcu_read_unlock(); } @@ -437,7 +437,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) } } - spin_lock(&node->send_queue_lock); + spin_lock_bh(&node->send_queue_lock); list_for_each_entry_rcu(mh, &node->send_queue, list) { if (before(mh->seq, seq)) { dlm_mhandle_delete(node, mh); @@ -446,7 +446,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) break; } } - spin_unlock(&node->send_queue_lock); + spin_unlock_bh(&node->send_queue_lock); rcu_read_unlock(); } @@ -890,12 +890,7 @@ static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) dlm_receive_buffer(p, nodeid); } -/* - * Called from the low-level comms layer to process a buffer of - * commands. - */ - -int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) { const unsigned char *ptr = buf; const struct dlm_header *hd; @@ -930,6 +925,32 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; + ret += msglen; + len -= msglen; + ptr += msglen; + } + + return ret; +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. + */ +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +{ + const unsigned char *ptr = buf; + const struct dlm_header *hd; + uint16_t msglen; + int ret = 0; + + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + msglen = le16_to_cpu(hd->h_length); + if (msglen > len) + break; + switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); @@ -1046,9 +1067,9 @@ static void midcomms_new_msg_cb(void *data) atomic_inc(&mh->node->send_queue_cnt); - spin_lock(&mh->node->send_queue_lock); + spin_lock_bh(&mh->node->send_queue_lock); list_add_tail_rcu(&mh->list, &mh->node->send_queue); - spin_unlock(&mh->node->send_queue_lock); + spin_unlock_bh(&mh->node->send_queue_lock); mh->seq = mh->node->seq_send++; } diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 69296552d5ad..bea1cee4279c 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -14,6 +14,7 @@ struct midcomms_node; +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len); int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); -- cgit From 502487847743018c93d75b401eac2ea4c4973123 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Sat, 19 Nov 2022 12:51:59 +0800 Subject: cifs: fix missing unlock in cifs_file_copychunk_range() xfstests generic/013 and generic/476 reported WARNING as follows: WARNING: lock held when returning to user space! 6.1.0-rc5+ #4 Not tainted ------------------------------------------------ fsstress/504233 is leaving the kernel with locks still held! 2 locks held by fsstress/504233: #0: ffff888054c38850 (&sb->s_type->i_mutex_key#21){+.+.}-{3:3}, at: lock_two_nondirectories+0xcf/0xf0 #1: ffff8880b8fec750 (&sb->s_type->i_mutex_key#21/4){+.+.}-{3:3}, at: lock_two_nondirectories+0xb7/0xf0 This will lead to deadlocks and hung tasks. Fix this by releasing the locks when writing out a file range fails in cifs_file_copychunk_range().
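The fix follows the usual goto-unwind convention for functions that take locks up front; a hedged sketch of the shape (hypothetical helpers, not the actual cifs code): every failure path after the locks are taken must pass through the unlock label.

#include <linux/fs.h>

extern ssize_t flush_source_range(struct inode *inode);		/* hypothetical */
extern ssize_t do_copychunk(struct inode *src, struct inode *dst);	/* hypothetical */

static ssize_t copychunk_example(struct inode *src, struct inode *dst)
{
	ssize_t rc;

	lock_two_nondirectories(src, dst);

	rc = flush_source_range(src);
	if (rc)
		goto unlock;	/* the buggy path returned here with locks held */

	rc = do_copychunk(src, dst);

unlock:
	unlock_two_nondirectories(src, dst);
	return rc;
}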
Fixes: 3e3761f1ec7d ("smb3: use filemap_write_and_wait_range instead of filemap_write_and_wait") Cc: stable@vger.kernel.org # 6.0 Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: ChenXiaoSong Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index fe220686bba4..712a43161448 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1281,7 +1281,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, rc = filemap_write_and_wait_range(src_inode->i_mapping, off, off + len - 1); if (rc) - goto out; + goto unlock; /* should we flush first and last page first */ truncate_inode_pages(&target_inode->i_data, 0); @@ -1297,6 +1297,8 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * that target is updated on the server */ CIFS_I(target_inode)->time = 0; + +unlock: /* although unlocking in the reverse order from locking is not * strictly necessary here it is a little cleaner to be consistent */ -- cgit From 28b4b0596343d19d140da059eee0e5c2b5328731 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 17 Nov 2022 13:02:56 -0800 Subject: xfs: fix incorrect i_nlink caused by inode racing The following error occurred during the fsstress test: XFS: Assertion failed: VFS_I(ip)->i_nlink >= 2, file: fs/xfs/xfs_inode.c, line: 2452 The problem was that an inode race condition caused an incorrect i_nlink to be written to disk and then read back into memory. Consider the following call graph: for inodes that are marked both XFS_IFLUSHING and XFS_IRECLAIMABLE, i_nlink is reset to 1 and then restored to its original value in xfs_reinit_inode(), so the on-disk i_nlink of a directory may end up set to 1. xfsaild xfs_inode_item_push xfs_iflush_cluster xfs_iflush xfs_inode_to_disk xfs_iget xfs_iget_cache_hit xfs_iget_recycle xfs_reinit_inode inode_init_always xfs_reinit_inode() needs to hold the ILOCK_EXCL as it is changing internal inode state and can race with other RCU protected inode lookups. On the read side, xfs_iflush_cluster() grabs the ILOCK_SHARED while under rcu + ip->i_flags_lock, and so xfs_iflush/xfs_inode_to_disk() are protected from racing inode updates (during transactions) by that lock. Fixes: ff7bebeb91f8 ("xfs: refactor the inode recycling code") # goes further back than this Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eae7427062cf..f35e2cee5265 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -342,6 +342,9 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return -EAGAIN; + /* * We need to make it look like the inode is being reclaimed to prevent * the actual reclaim workers from stomping over us while we recycle @@ -355,6 +358,7 @@ xfs_iget_recycle( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { /* * Re-initializing the inode failed, and we are in deep @@ -518,6 +522,8 @@ xfs_iget_cache_hit( if (ip->i_flags & XFS_IRECLAIMABLE) { /* Drops i_flags_lock and RCU read lock.
*/ error = xfs_iget_recycle(pag, ip); + if (error == -EAGAIN) + goto out_skip; if (error) return error; } else { -- cgit From fce3caea0f241f5d34855c82c399d5e0e2d91f07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:42 +0100 Subject: blk-crypto: don't use struct request_queue for public interfaces Switch all public blk-crypto interfaces to use struct block_device arguments to specify the device they operate on instead of the request_queue, which is a block layer implementation detail. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/crypto/inline_crypt.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index cea8b14007e6..55c4d8c23d30 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -139,8 +139,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { - if (!blk_crypto_config_supported(bdev_get_queue(devs[i]), - &crypto_cfg)) + if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) goto out_free_devs; } @@ -184,8 +183,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, goto fail; } for (i = 0; i < num_devs; i++) { - err = blk_crypto_start_using_key(blk_key, - bdev_get_queue(devs[i])); + err = blk_crypto_start_using_key(devs[i], blk_key); if (err) break; } @@ -224,7 +222,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, devs = fscrypt_get_devices(sb, &num_devs); if (!IS_ERR(devs)) { for (i = 0; i < num_devs; i++) - blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key); + blk_crypto_evict_key(devs[i], blk_key); kfree(devs); } kfree_sensitive(blk_key); -- cgit From 6715c98b6cf003f26b1b2f655393134e9d999a05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:43 +0100 Subject: blk-crypto: add a blk_crypto_config_supported_natively helper Add a blk_crypto_config_supported_natively helper that wraps __blk_crypto_cfg_supported to retrieve the crypto_profile from the request queue. With this fscrypt can stop including blk-crypto-profile.h and rely on the public consumer interface in blk-crypto.h. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/crypto/inline_crypt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 55c4d8c23d30..8bfb3ce86476 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use.
*/ -#include <linux/blk-crypto-profile.h> +#include <linux/blk-crypto.h> #include #include #include @@ -77,10 +77,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, unsigned int i; for (i = 0; i < num_devs; i++) { - struct request_queue *q = bdev_get_queue(devs[i]); - if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg)) { + blk_crypto_config_supported_natively(devs[i], cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); -- cgit From 4e45886956a20942800259f326a04417292ae314 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Sun, 20 Nov 2022 18:57:59 +0800 Subject: zonefs: Fix race between modprobe and mount There is a race between modprobe and mount as below: modprobe zonefs | mount -t zonefs --------------------------------|------------------------- zonefs_init | register_filesystem [1] | | zonefs_fill_super [2] zonefs_sysfs_init [3] | 1. registering zonefs succeeds, so 2. userspace can already mount a zonefs filesystem, but 3. if the sysfs initialization fails, the module initialization fails as a whole. A mount issued in that window can then hit errors, since the module never finished initializing. Let's register zonefs only after all the resources it depends on are ready, and reorder the release of those resources accordingly in the module exit path. Fixes: 9277a6d4fbd4 ("zonefs: Export open zone resource information through sysfs") Signed-off-by: Zhang Xiaoxu Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index abc9a85106f2..f0e8a000f073 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1922,18 +1922,18 @@ static int __init zonefs_init(void) if (ret) return ret; - ret = register_filesystem(&zonefs_type); + ret = zonefs_sysfs_init(); if (ret) goto destroy_inodecache; - ret = zonefs_sysfs_init(); + ret = register_filesystem(&zonefs_type); if (ret) - goto unregister_fs; + goto sysfs_exit; return 0; -unregister_fs: - unregister_filesystem(&zonefs_type); +sysfs_exit: + zonefs_sysfs_exit(); destroy_inodecache: zonefs_destroy_inodecache(); @@ -1942,9 +1942,9 @@ destroy_inodecache: static void __exit zonefs_exit(void) { + unregister_filesystem(&zonefs_type); zonefs_sysfs_exit(); zonefs_destroy_inodecache(); - unregister_filesystem(&zonefs_type); } MODULE_AUTHOR("Damien Le Moal"); -- cgit From 03e02acda8e267a8183e1e0ed289ff1ef9cd7ed8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:13:44 -0700 Subject: eventfd: provide a eventfd_signal_mask() helper This is identical to eventfd_signal(), but it allows the caller to pass in a mask to be used for the poll wakeup key. The use case is avoiding repeated multishot triggers if we have a dependency between eventfd and io_uring. If we set up an eventfd context and register that as the io_uring eventfd, and at the same time queue a multishot poll request for the eventfd context, then any CQE posted will repeatedly trigger the multishot request until it terminates when the CQ ring overflows. In preparation for io_uring detecting this circular dependency, add the mentioned helper so that io_uring can pass in EPOLL_URING as part of the poll wakeup key.
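To make the circular dependency concrete, the setup that triggers it can be sketched from userspace with liburing (a minimal sketch; setup_loop_prone_ring() is an illustrative name, and error handling is elided):

#include <poll.h>
#include <sys/eventfd.h>
#include <liburing.h>

/*
 * Every CQE posted to the ring signals the registered eventfd, and the
 * multishot poll request turns every eventfd signal into another CQE,
 * so the two feed each other until the CQ ring overflows.
 */
static int setup_loop_prone_ring(struct io_uring *ring)
{
	struct io_uring_sqe *sqe;
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;

	/* every CQE posted to this ring now signals efd ... */
	io_uring_register_eventfd(ring, efd);

	/* ... and this multishot poll turns every efd signal into a CQE */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_poll_multishot(sqe, efd, POLLIN);
	io_uring_submit(ring);

	return efd;
}

With eventfd_signal_mask(), io_uring can tag its own wakeups with EPOLL_URING in the poll key, and its poll handling can then recognize and skip them, breaking the loop.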
Cc: stable@vger.kernel.org # 6.0 [axboe: fold in !CONFIG_EVENTFD fix from Zhang Qilong] Signed-off-by: Jens Axboe --- fs/eventfd.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/eventfd.c b/fs/eventfd.c index c0ffee99ad23..249ca6c0b784 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,21 +43,7 @@ struct eventfd_ctx { int id; }; -/** - * eventfd_signal - Adds @n to the eventfd counter. - * @ctx: [in] Pointer to the eventfd context. - * @n: [in] Value of the counter to be added to the eventfd internal counter. - * The value cannot be negative. - * - * This function is supposed to be called by the kernel in paths that do not - * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returning a EPOLLERR - * to poll(2). - * - * Returns the amount by which the counter was incremented. This will be less - * than @n if the counter has overflowed. - */ -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) { unsigned long flags; @@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) - wake_up_locked_poll(&ctx->wqh, EPOLLIN); + wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; } + +/** + * eventfd_signal - Adds @n to the eventfd counter. + * @ctx: [in] Pointer to the eventfd context. + * @n: [in] Value of the counter to be added to the eventfd internal counter. + * The value cannot be negative. + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returning a EPOLLERR + * to poll(2). + * + * Returns the amount by which the counter was incremented. This will be less + * than @n if the counter has overflowed. + */ +__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +{ + return eventfd_signal_mask(ctx, n, 0); +} EXPORT_SYMBOL_GPL(eventfd_signal); static void eventfd_free_ctx(struct eventfd_ctx *ctx) -- cgit From 7a5e9f1f83e3271a9f05933a80b870fe55ebbb3d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 22 Nov 2022 09:48:01 -0500 Subject: fs: dlm: fix building without lockdep This patch uses assert_spin_locked() instead of lockdep_is_held() where possible, because lockdep_is_held() is only available if CONFIG_LOCKDEP is set. In the other cases, like lockdep_sock_is_held(), we surround the check with a CONFIG_LOCKDEP ifdef.
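The two assertion styles the patch below switches between, distilled into a standalone sketch (check_conn_locks() is an illustrative name, not dlm code):

/* assert_spin_locked() is available whether or not lockdep is built in */
static void check_conn_locks(struct connection *con)
{
	assert_spin_locked(&con->writequeue_lock);

#ifdef CONFIG_LOCKDEP
	/* lockdep_sock_is_held() only exists under CONFIG_LOCKDEP */
	WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk));
#endif
}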
Fixes: dbb751ffab0b ("fs: dlm: parallelize lowcomms socket handling") Reported-by: kernel test robot Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 799d1c36eabf..8b80ca0cd65f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -208,7 +208,7 @@ bool dlm_lowcomms_is_running(void) static void lowcomms_queue_swork(struct connection *con) { - WARN_ON_ONCE(!lockdep_is_held(&con->writequeue_lock)); + assert_spin_locked(&con->writequeue_lock); if (!test_bit(CF_IO_STOP, &con->flags) && !test_bit(CF_APP_LIMITED, &con->flags) && @@ -218,7 +218,9 @@ static void lowcomms_queue_rwork(struct connection *con) { +#ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); +#endif if (!test_bit(CF_IO_STOP, &con->flags) && !test_and_set_bit(CF_RECV_PENDING, &con->flags)) @@ -618,7 +620,9 @@ static void lowcomms_error_report(struct sock *sk) static void restore_callbacks(struct sock *sk) { +#ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk)); +#endif sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; -- cgit From 4e3c51f4e805291b057d12f5dda5aeb50a538dc4 Mon Sep 17 00:00:00 2001 From: Svyatoslav Feldsherov Date: Tue, 15 Nov 2022 20:20:01 +0000 Subject: fs: do not update freeing inode i_io_list After commit cbfecb927f42 ("fs: record I_DIRTY_TIME even if inode already has I_DIRTY_INODE") writeback_single_inode() can push an inode with I_DIRTY_TIME set onto the b_dirty_time list. When freeing an inode with I_DIRTY_TIME set, this can happen after the inode has already been deleted from i_io_list at evict time. The stack trace is the following. evict fat_evict_inode fat_truncate_blocks fat_flush_inodes writeback_inode sync_inode_metadata(inode, sync=0) writeback_single_inode(inode, wbc) <- wbc->sync_mode == WB_SYNC_NONE This will lead to a use-after-free in the flusher thread. A similar issue can be triggered if writeback_single_inode() in the stack trace updates inode->i_io_list. Add an explicit check to avoid it. Fixes: cbfecb927f42 ("fs: record I_DIRTY_TIME even if inode already has I_DIRTY_INODE") Reported-by: syzbot+6ba92bd00d5093f7e371@syzkaller.appspotmail.com Reviewed-by: Jan Kara Signed-off-by: Svyatoslav Feldsherov Link: https://lore.kernel.org/r/20221115202001.324188-1-feldsherov@google.com Signed-off-by: Theodore Ts'o --- fs/fs-writeback.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 443f83382b9b..9958d4020771 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,18 +1712,26 @@ static int writeback_single_inode(struct inode *inode, wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* - * If the inode is now fully clean, then it can be safely removed from - * its writeback list (if any). Otherwise the flusher threads are - * responsible for the writeback lists. + * If the inode is freeing, its i_io_list shouldn't be updated + * as it can be finally deleted at this moment.
*/ - if (!(inode->i_state & I_DIRTY_ALL)) - inode_cgwb_move_to_attached(inode, wb); - else if (!(inode->i_state & I_SYNC_QUEUED)) { - if ((inode->i_state & I_DIRTY)) - redirty_tail_locked(inode, wb); - else if (inode->i_state & I_DIRTY_TIME) { - inode->dirtied_when = jiffies; - inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); + if (!(inode->i_state & I_FREEING)) { + /* + * If the inode is now fully clean, then it can be safely + * removed from its writeback list (if any). Otherwise the + * flusher threads are responsible for the writeback lists. + */ + if (!(inode->i_state & I_DIRTY_ALL)) + inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED)) { + if ((inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + inode_io_list_move_locked(inode, + wb, + &wb->b_dirty_time); + } } } -- cgit From 572302af1258459e124437b8f3369357447afac7 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 10 Nov 2022 10:46:35 +0100 Subject: reiserfs: Add missing calls to reiserfs_security_free() Commit 57fe60df6241 ("reiserfs: add atomic addition of selinux attributes during inode creation") defined reiserfs_security_free() to free the name and value of a security xattr allocated by the active LSM through security_old_inode_init_security(). However, this function is not called in the reiserfs code. Thus, add a call to reiserfs_security_free() whenever reiserfs_security_init() is called, and initialize the value pointer to NULL, to avoid calling kfree() on an uninitialized pointer. Finally, remove the kfree() for the xattr name, as it is not allocated anymore. Fixes: 57fe60df6241 ("reiserfs: add atomic addition of selinux attributes during inode creation") Cc: stable@vger.kernel.org Cc: Jeff Mahoney Cc: Tetsuo Handa Reported-by: Mimi Zohar Reported-by: Tetsuo Handa Signed-off-by: Roberto Sassu Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- fs/reiserfs/namei.c | 4 ++++ fs/reiserfs/xattr_security.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3d7a35d6a18b..b916859992ec 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -696,6 +696,7 @@ static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -779,6 +780,7 @@ static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -878,6 +880,7 @@ static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); + reiserfs_security_free(&security); return retval; } @@ -1194,6 +1197,7 @@ static int reiserfs_symlink(struct user_namespace *mnt_userns, retval = journal_end(&th); out_failed: reiserfs_write_unlock(parent_dir->i_sb); + reiserfs_security_free(&security); return retval; } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 8965c8e5e172..857a65b05726 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -50,6 +50,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, int error; sec->name = NULL; + sec->value = NULL; /* Don't add selinux attributes on xattrs - they'll never get used */ if (IS_PRIVATE(dir)) @@ -95,7 +96,6 @@ int reiserfs_security_write(struct
reiserfs_transaction_handle *th, void reiserfs_security_free(struct reiserfs_security_handle *sec) { - kfree(sec->name); kfree(sec->value); sec->name = NULL; sec->value = NULL; -- cgit From 198dd8aedee6a7d2de0dfa739f9a008a938f6848 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:40:11 +1100 Subject: xfs: punching delalloc extents on write failure is racy xfs_buffered_write_iomap_end() has a comment about the safety of punching delalloc extents based holding the IOLOCK_EXCL. This comment is wrong, and punching delalloc extents is not race free. When we punch out a delalloc extent after a write failure in xfs_buffered_write_iomap_end(), we punch out the page cache with truncate_pagecache_range() before we punch out the delalloc extents. At this point, we only hold the IOLOCK_EXCL, so there is nothing stopping mmap() write faults racing with this cleanup operation, reinstantiating a folio over the range we are about to punch and hence requiring the delalloc extent to be kept. If this race condition is hit, we can end up with a dirty page in the page cache that has no delalloc extent or space reservation backing it. This leads to bad things happening at writeback time. To avoid this race condition, we need the page cache truncation to be atomic w.r.t. the extent manipulation. We can do this by holding the mapping->invalidate_lock exclusively across this operation - this will prevent new pages from being inserted into the page cache whilst we are removing the pages and the backing extent and space reservation. Taking the mapping->invalidate_lock exclusively in the buffered write IO path is safe - it naturally nests inside the IOLOCK (see truncate and fallocate paths). iomap_zero_range() can be called from under the mapping->invalidate_lock (from the truncate path via either xfs_zero_eof() or xfs_truncate_page(), but iomap_zero_iter() will not instantiate new delalloc pages (because it skips holes) and hence will not ever need to punch out delalloc extents on failure. Fix the locking issue, and clean up the code logic a little to avoid unnecessary work if we didn't allocate the delalloc extent or wrote the entire region we allocated. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5cea069a38b4..a2e45ea1b0cb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1147,6 +1147,10 @@ xfs_buffered_write_iomap_end( written = 0; } + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + /* * start_fsb refers to the first unused block after a short write. If * nothing was written, round offset down to point at the first block in @@ -1158,27 +1162,28 @@ xfs_buffered_write_iomap_end( start_fsb = XFS_B_TO_FSB(mp, offset + written); end_fsb = XFS_B_TO_FSB(mp, offset + length); + /* Nothing to do if we've written the entire delalloc extent */ + if (start_fsb >= end_fsb) + return 0; + /* - * Trim delalloc blocks if they were allocated by this write and we - * didn't manage to write the whole range. - * - * We don't need to care about racing delalloc as we hold i_mutex - * across the reserve/allocate/unreserve calls. If there are delalloc - * blocks in the range, they are ours. 
+ * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite between the page cache + * truncation and the delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. */ - if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); - return error; - } + filemap_invalidate_lock(inode->i_mapping); + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + filemap_invalidate_unlock(inode->i_mapping); + if (error && !xfs_is_shutdown(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; } - return 0; } -- cgit From b71f889c18ada210a97aa3eb5e00c0de552234c6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:40:12 +1100 Subject: xfs: use byte ranges for write cleanup ranges xfs_buffered_write_iomap_end() currently converts the byte ranges passed to it to filesystem blocks to pass them to the bmap code to punch out delalloc blocks, but then has to convert filesystem blocks back to byte ranges for page cache truncate. We're about to make the page cache truncate go away and replace it with a page cache walk, so having to convert everything to/from/to filesystem blocks is messy and error-prone. It is much easier to pass around byte ranges and convert to page indexes and/or filesystem blocks only where those units are needed. In preparation for the page cache walk being added, add a helper that converts byte ranges to filesystem blocks and calls xfs_bmap_punch_delalloc_range() and convert xfs_buffered_write_iomap_end() to calculate limits in byte ranges. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a2e45ea1b0cb..7bb55dbc19d3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1120,6 +1120,20 @@ out_unlock: return error; } +static int +xfs_buffered_write_delalloc_punch( + struct inode *inode, + loff_t start_byte, + loff_t end_byte) +{ + struct xfs_mount *mp = XFS_M(inode->i_sb); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); + + return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, + end_fsb - start_fsb); +} + static int xfs_buffered_write_iomap_end( struct inode *inode, @@ -1129,10 +1143,9 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; + struct xfs_mount *mp = XFS_M(inode->i_sb); + loff_t start_byte; + loff_t end_byte; int error = 0; if (iomap->type != IOMAP_DELALLOC) @@ -1157,13 +1170,13 @@ xfs_buffered_write_iomap_end( * the range.
*/ if (unlikely(!written)) - start_fsb = XFS_B_TO_FSBT(mp, offset); + start_byte = round_down(offset, mp->m_sb.sb_blocksize); else - start_fsb = XFS_B_TO_FSB(mp, offset + written); - end_fsb = XFS_B_TO_FSB(mp, offset + length); + start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); + end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); /* Nothing to do if we've written the entire delalloc extent */ - if (start_fsb >= end_fsb) + if (start_byte >= end_byte) return 0; /* @@ -1173,15 +1186,12 @@ xfs_buffered_write_iomap_end( * leave dirty pages with no space reservation in the cache. */ filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); + truncate_pagecache_range(inode, start_byte, end_byte - 1); + error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); filemap_invalidate_unlock(inode->i_mapping); if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", + __func__, XFS_I(inode)->i_ino); return error; } return 0; -- cgit From 9c7babf94a0d686b552e53aded8d4703d1b8b92b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 23 Nov 2022 12:44:38 +1100 Subject: xfs,iomap: move delalloc punching to iomap Because that's what Christoph wants for this error handling path only XFS uses. It requires a new iomap export for handling errors over delalloc ranges. This is basically the XFS code as it stands, but even though Christoph wants this as iomap functionality, we still have to call it from the filesystem specific ->iomap_end callback, and call into the iomap code with yet another filesystem specific callback to punch the delalloc extent within the defined ranges. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iomap.c | 47 +++++++-------------------------------- 2 files changed, 68 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 91ee0b308e13..734b761a1e4a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -832,6 +832,66 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * When a short write occurs, the filesystem may need to remove reserved space + * that was allocated in ->iomap_begin from it's ->iomap_end method. For + * filesystems that use delayed allocation, we need to punch out delalloc + * extents from the range that are not dirty in the page cache. As the write can + * race with page faults, there can be dirty pages over the delalloc extent + * outside the range of a short write but still within the delalloc extent + * allocated for this iomap. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations, but converts them back to {offset,len} tuples for + * the punch callback.
+ */ +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, + ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t start_byte; + loff_t end_byte; + int blocksize = i_blocksize(inode); + int error = 0; + + if (iomap->type != IOMAP_DELALLOC) + return 0; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* + * start_byte refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_byte = round_down(pos, blocksize); + else + start_byte = round_up(pos + written, blocksize); + end_byte = round_up(pos + length, blocksize); + + /* Nothing to do if we've written the entire delalloc extent */ + if (start_byte >= end_byte) + return 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite between the page cache + * truncation and the delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + truncate_pagecache_range(inode, start_byte, end_byte - 1); + error = punch(inode, start_byte, end_byte - start_byte); + filemap_invalidate_unlock(inode->i_mapping); + + return error; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); + static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7bb55dbc19d3..ea96e8a34868 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1123,12 +1123,12 @@ out_unlock: static int xfs_buffered_write_delalloc_punch( struct inode *inode, - loff_t start_byte, - loff_t end_byte) + loff_t offset, + loff_t length) { struct xfs_mount *mp = XFS_M(inode->i_sb); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, end_fsb - start_fsb); @@ -1143,13 +1143,9 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_mount *mp = XFS_M(inode->i_sb); - loff_t start_byte; - loff_t end_byte; - int error = 0; - if (iomap->type != IOMAP_DELALLOC) - return 0; + struct xfs_mount *mp = XFS_M(inode->i_sb); + int error; /* * Behave as if the write failed if drop writes is enabled. Set the NEW @@ -1160,35 +1156,8 @@ xfs_buffered_write_iomap_end( written = 0; } - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return 0; - - /* - * start_fsb refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. 
- */ - if (unlikely(!written)) - start_byte = round_down(offset, mp->m_sb.sb_blocksize); - else - start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); - end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return 0; - - /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite between the page cache - * truncation and the delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. - */ - filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(inode, start_byte, end_byte - 1); - error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); - filemap_invalidate_unlock(inode->i_mapping); + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, + length, written, &xfs_buffered_write_delalloc_punch); if (error && !xfs_is_shutdown(mp)) { xfs_alert(mp, "%s: unable to clean up ino 0x%llx", __func__, XFS_I(inode)->i_ino); return error; } - return 0; } -- cgit From f850c84948ef2d4f5e11fd8e528c2ac3b3c3d9c4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 17 Nov 2022 04:32:47 +0000 Subject: proc/meminfo: fix spacing in SecPageTables SecPageTables has a tab after it instead of a space, which can break fragile parsers that depend on spaces after the stat names. Link: https://lkml.kernel.org/r/20221117043247.133294-1-yosryahmed@google.com Fixes: ebc97a52b5d6cd5f ("mm: add NR_SECONDARY_PAGETABLE to count secondary page table uses.") Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: David Hildenbrand Cc: Marc Zyngier Cc: Sean Christopherson Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 5101131e6047..440960110a42 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -115,7 +115,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); - show_val_kb(m, "SecPageTables: ", + show_val_kb(m, "SecPageTables: ", global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); -- cgit From 512c5ca01a3610ab14ff6309db363de51f1c13a6 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Fri, 18 Nov 2022 14:33:04 +0800 Subject: nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty When extending segments, nilfs_sufile_alloc() is called to get an unassigned segment, then mark it as dirty to avoid accidentally allocating the same segment in the future. But in some special cases, such as a corrupted image, this can be unreliable. If such corruption of the dirty state of the segment occurs, nilfs2 may reallocate a segment that is in use and pick the same segment for writing twice at the same time. This will cause the problem reported by syzkaller: https://syzkaller.appspot.com/bug?id=c7c4748e11ffcc367cef04f76e02e931833cbd24 This case started with segbuf1.segnum = 3, nextnum = 4 when constructed, which assumed that segment 4 had already been allocated and marked as dirty. However, the dirty state was corrupted and the segment 4 usage was not dirty. The first time through nilfs_segctor_extend_segments(), segment 4 was allocated again, which left segbuf2 and the next segbuf3 with the same segment 4. sb_getblk() will return the same bh for segbuf2 and segbuf3, and this bh is added to the buffer lists of both segbufs.
This breaks the lists, which causes a NULL pointer dereference. Fix the problem by always setting the segment usage as dirty in nilfs_sufile_mark_dirty(), which is called while constructing the current segment to be written out and before allocating the next segment. [chenzhongjin@huawei.com: add lock protection per Ryusuke] Link: https://lkml.kernel.org/r/20221121091141.214703-1-chenzhongjin@huawei.com Link: https://lkml.kernel.org/r/20221118063304.140187-1-chenzhongjin@huawei.com Fixes: 9ff05123e3bf ("nilfs2: segment constructor") Signed-off-by: Chen Zhongjin Reported-by: Reported-by: Liu Shixin Acked-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/sufile.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 77ff8e95421f..dc359b56fdfa 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; + void *kaddr; + struct nilfs_segment_usage *su; int ret; + down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (!ret) { mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); brelse(bh); } + up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } -- cgit From 44361e8cf9ddb23f17bdcc40ca944abf32e83e79 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Nov 2022 09:10:42 +0100 Subject: fuse: lock inode unconditionally in fuse_fallocate() file_modified() must be called with the inode lock held. fuse_fallocate() didn't lock the inode when the mode was just FALLOC_FL_KEEP_SIZE, which resulted in a kernel warning in notify_change(). Lock the inode unconditionally, like all other fallocate implementations do.
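The rule being enforced can be shown in isolation; a minimal sketch of a conforming ->fallocate method (fallocate_sketch() is an illustrative name, not the fuse implementation that follows):

static long fallocate_sketch(struct file *file, int mode,
			     loff_t offset, loff_t length)
{
	struct inode *inode = file_inode(file);
	int err;

	inode_lock(inode);

	/*
	 * file_modified() may clear SUID/SGID bits and update timestamps
	 * via notify_change(), which warns if i_rwsem is not held.
	 */
	err = file_modified(file);
	if (err)
		goto out;

	/* ... perform the actual space allocation ... */
out:
	inode_unlock(inode);
	return err;
}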
Reported-by: Pengfei Xu Reported-and-tested-by: syzbot+462da39f0667b357c4b6@syzkaller.appspotmail.com Fixes: 4a6f278d4827 ("fuse: add file_modified() to fallocate") Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 71bfb663aac5..89f4741728ba 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2963,11 +2963,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, .mode = mode }; int err; - bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || - (mode & (FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_ZERO_RANGE)); - - bool block_faults = FUSE_IS_DAX(inode) && lock_inode; + bool block_faults = FUSE_IS_DAX(inode) && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) @@ -2976,22 +2974,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fm->fc->no_fallocate) return -EOPNOTSUPP; - if (lock_inode) { - inode_lock(inode); - if (block_faults) { - filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); - if (err) - goto out; - } + inode_lock(inode); + if (block_faults) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { - loff_t endbyte = offset + length - 1; + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { + loff_t endbyte = offset + length - 1; - err = fuse_writeback_range(inode, offset, endbyte); - if (err) - goto out; - } + err = fuse_writeback_range(inode, offset, endbyte); + if (err) + goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && @@ -3039,8 +3035,7 @@ out: if (block_faults) filemap_invalidate_unlock(inode->i_mapping); - if (lock_inode) - inode_unlock(inode); + inode_unlock(inode); fuse_flush_time_update(inode); -- cgit From a1db2f7edef095a385d477ab81e780694d63eebd Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 12 Oct 2022 13:23:23 +0200 Subject: fs/fuse: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as the mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Therefore, replace kmap() with kmap_local_page() in fuse_readdir_cached(), it being the only call site of kmap() currently left in fs/fuse. Cc: "Venkataramanan, Anirudh" Suggested-by: Ira Weiny Signed-off-by: Fabio M. 
De Francesco Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e8deaacf1832..dc603479b30e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -547,9 +547,9 @@ retry_locked: * Contents of the page are now protected against changing by holding * the page lock. */ - addr = kmap(page); + addr = kmap_local_page(page); res = fuse_parse_cache(ff, addr, size, ctx); - kunmap(page); + kunmap_local(addr); unlock_page(page); put_page(page); -- cgit From 4f8d37020e1fd0bf6ee9381ba918135ef3712efd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2022 14:25:21 +0200 Subject: fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY Add a flag to entry expiration that lets the filesystem expire a dentry without kicking it out from the cache immediately. This makes a difference for overmounted dentries, where plain invalidation would detach all submounts before dropping the dentry from the cache. If only expiry is set on the dentry, then any overmounts are left alone until ->d_revalidate() is called. Note: ->d_revalidate() is not called for the case of following a submount, so invalidation will only be triggered for the non-overmounted case. The dentry could also be mounted in a different mount instance, in which case any submounts will still be detached. Suggested-by: Jakob Blomer Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 ++-- fs/fuse/dir.c | 6 ++++-- fs/fuse/fuse_i.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..0a845fac3ba8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); kfree(buf); return err; @@ -1546,7 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); kfree(buf); return err; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb97a384dc5d..d092c7d75929 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1170,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name) + u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; @@ -1197,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, goto unlock; fuse_dir_changed(parent); - fuse_invalidate_entry(entry); + if (!(flags & FUSE_EXPIRE_ONLY)) + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 98a9cf531873..8789d94eda21 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1220,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, * then the dentry is unhashed (d_delete()).
*/ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name); + u64 child_nodeid, struct qstr *name, u32 flags); int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); -- cgit From ccc031e26afe60d2a5a3d93dabd9c978210825fb Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Wed, 28 Sep 2022 20:19:34 +0800 Subject: fuse: always revalidate rename target dentry The previous commit df8629af2934 ("fuse: always revalidate if exclusive create") ensures that the dentries are revalidated on O_EXCL creates. This commit complements it by also performing revalidation for rename target dentries. Otherwise, a rename target file that only exists in kernel dentry cache but not in the filesystem will result in EEXIST if RENAME_NOREPLACE flag is used. Signed-off-by: Jiachen Zhang Signed-off-by: Zhang Tianci Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d092c7d75929..2c4b08a6ec81 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; -- cgit From 0618021e34c6b1edc4fc754ec53ab7fdcc98aaec Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 14 Sep 2022 16:26:32 +0200 Subject: fuse: Remove user_ns check for FUSE_DEV_IOC_CLONE Commit 8ed1f0e22f49e ("fs/fuse: fix ioctl type confusion") fixed a type confusion bug by adding an ->f_op comparison. Based on some off-list discussion back then, another check was added to compare the f_cred->user_ns. This is not for security reasons, but was based on the idea that a FUSE device FD should be using the UID/GID mappings of its f_cred->user_ns, and those translations are done using fc->user_ns, which matches the f_cred->user_ns of the initial FUSE device FD thanks to the check in fuse_fill_super(). See also commit 8cb08329b0809 ("fuse: Support fuse filesystems outside of init_user_ns"). But FUSE_DEV_IOC_CLONE is, at a higher level, a *cloning* operation that copies an existing context (with a weird API that involves first opening /dev/fuse, then tying the resulting new FUSE device FD to an existing FUSE instance). So if an application is already passing FUSE FDs across userns boundaries and dealing with the resulting ID mapping complications somehow, it doesn't make much sense to block this cloning operation. I've heard that this check is an obstacle for some folks, and I don't see a good reason to keep it, so remove it. Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a845fac3ba8..c73d9c4132f6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2267,8 +2267,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, * Check against file->f_op because CUSE * uses the same ioctl handler. 
*/ - if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) + if (old->f_op == file->f_op) fud = fuse_get_dev(old); if (fud) { -- cgit From 00d369bc2de5986504fc0d60bfeae4c342b2cad5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:40:21 +0200 Subject: fuse: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. Cc: Seth Forshee (Digital Ocean) Cc: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..84c1ca4bc1dc 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -98,7 +98,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; -- cgit From e2283a736676554f72dbdcb62fdc1d23daf7044f Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Thu, 1 Sep 2022 07:42:59 +0000 Subject: fuse: remove the unneeded result variable Return the value of fuse_dev_release() directly instead of storing it in a redundant variable. Reported-by: Zeal Robot Signed-off-by: ye xingchen Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c7d882a9fe33..a06fbb1a8a5b 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) { struct fuse_dev *fud = file->private_data; struct cuse_conn *cc = fc_to_cc(fud->fc); - int rc; /* remove from the conntbl, no more access from this point on */ mutex_lock(&cuse_lock); @@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file) cdev_del(cc->cdev); } - rc = fuse_dev_release(inode, file); /* puts the base reference */ - - return rc; + return fuse_dev_release(inode, file); } static struct file_operations cuse_channel_fops; /* initialized during init */ -- cgit From 153524053bbb0d27bb2e0be36d1b46862e9ce74c Mon Sep 17 00:00:00 2001 From: Dharmendra Singh Date: Fri, 17 Jun 2022 12:40:27 +0530 Subject: fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over the inode lock, i.e. we hold the inode lock for the full duration of the write request. I could not find a comment in the fuse code or git history that clearly explains why this exclusive lock is taken for direct writes. The following might be among the reasons for acquiring the exclusive lock, but it is not necessarily limited to them: 1) Our guess is that some userspace fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) It rules out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only.
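Distilled, the locking scheme described here (and implemented in the patch further below) looks like this; needs_exclusive() stands in for the i_size check the patch adds:

static bool needs_exclusive(struct inode *inode, loff_t pos, size_t count)
{
	/* extending writes keep the exclusive lock */
	return pos + count > i_size_read(inode);
}

static bool lock_for_dio_write(struct inode *inode, loff_t pos, size_t count)
{
	bool exclusive = needs_exclusive(inode, pos, count);

	if (exclusive) {
		inode_lock(inode);
		return true;
	}

	inode_lock_shared(inode);
	/*
	 * The decision above was made without holding the lock; i_size
	 * may have changed (e.g. a racing truncate), so recheck and
	 * upgrade to the exclusive lock if necessary.
	 */
	if (needs_exclusive(inode, pos, count)) {
		inode_unlock_shared(inode);
		inode_lock(inode);
		exclusive = true;
	}
	return exclusive;
}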
File size extending writes might not need the lock either, but we are not entirely sure whether there is a risk of introducing any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take, on file size extension, a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes is the write error case. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPC. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (the flag is passed from libfuse to the fuse kernel as part of file open/create), we no longer take the exclusive lock, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 89f4741728ba..d360d38ab616 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; + bool exclusive_lock = + !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + iocb->ki_flags & IOCB_APPEND || + fuse_direct_write_extending_i_size(iocb, from); + + /* + * Take exclusive lock if + * - Parallel direct writes are disabled - a user space decision + * - Parallel direct writes are enabled and i_size is being extended. + * This might not be needed at all, but needs further investigation. + */ + if (exclusive_lock) + inode_lock(inode); + else { + inode_lock_shared(inode); + + /* A race with truncate might have come up as the decision for + * the lock type was done without holding the lock, check again.
+ */ + if (fuse_direct_write_extending_i_size(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + exclusive_lock = true; + } + } - /* Don't allow parallel writes to the same file */ - inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - inode_unlock(inode); + if (exclusive_lock) + inode_unlock(inode); + else + inode_unlock_shared(inode); return res; } @@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { fuse_write_update_attr(inode, pos, ret); + /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } -- cgit From b138777786f780b9d5ce1989032608acbede0493 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 25 Oct 2022 09:10:17 -0700 Subject: fuse: Rearrange fuse_allow_current_process checks This is a followup to a previous commit of mine [0], which added the allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch rearranges the order of checks in fuse_allow_current_process without changing functionality. Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other") added the allow_sys_admin_access && capable(CAP_SYS_ADMIN) check to the beginning of the function, with the reasoning that allow_sys_admin_access should be an 'escape hatch' for users with CAP_SYS_ADMIN, allowing them to skip any subsequent checks. However, placing this new check first results in many capable() calls when allow_sys_admin_access is set, where another check would've also returned 1. This can be problematic when a BPF program is tracing capable() calls. At Meta we ran into such a scenario recently. On a host where allow_sys_admin_access is set but most of the FUSE access is from processes which would pass other checks - i.e. they don't need the CAP_SYS_ADMIN 'escape hatch' - this results in an unnecessary capable() call for each fs op. We also have a daemon tracing capable() with BPF and doing some data collection, so tracing these extraneous capable() calls has the potential to regress performance for an application doing many FUSE ops. So rearrange the order of these checks such that the CAP_SYS_ADMIN 'escape hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to make the logic easier to understand. Previously, if allow_other was set on the fuse_conn, uid/gid checking didn't happen, as the current_in_userns() result was returned. These semantics are maintained here: the fuse_permissible_uidgid() check only happens if allow_other is not set.
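The underlying principle generalizes beyond fuse: order an access-control decision so that side-effect-free predicates run first and observable ones run last. A generic illustration (not fuse code; may_access() and the choice of CAP_DAC_OVERRIDE are only for the example):

static bool may_access(kuid_t owner)
{
	/* cheap check with no observable side effects first */
	if (uid_eq(current_fsuid(), owner))
		return true;

	/*
	 * capable() goes through the LSM/audit machinery and is visible
	 * to BPF tracers, so consult it only as a last resort.
	 */
	return capable(CAP_DAC_OVERRIDE);
}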
Signed-off-by: Dave Marchevsky Suggested-by: Andrii Nakryiko Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 35 ++++++++++++++++++++--------------- fs/fuse/fuse_i.h | 2 +- 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2c4b08a6ec81..aa67869e3444 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1237,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, return err; } +static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) +{ + const struct cred *cred = current_cred(); + + return (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)); +} + /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This @@ -1250,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_current_process(struct fuse_conn *fc) +bool fuse_allow_current_process(struct fuse_conn *fc) { - const struct cred *cred; - - if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) - return 1; + bool allow; if (fc->allow_other) - return current_in_userns(fc->user_ns); + allow = current_in_userns(fc->user_ns); + else + allow = fuse_permissible_uidgid(fc); - cred = current_cred(); - if (uid_eq(cred->euid, fc->user_id) && - uid_eq(cred->suid, fc->user_id) && - uid_eq(cred->uid, fc->user_id) && - gid_eq(cred->egid, fc->group_id) && - gid_eq(cred->sgid, fc->group_id) && - gid_eq(cred->gid, fc->group_id)) - return 1; + if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + allow = true; - return 0; + return allow; } static int fuse_access(struct inode *inode, int mask) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8789d94eda21..d339b1ace887 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1179,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr); /** * Is current process allowed to perform filesystem operation? */ -int fuse_allow_current_process(struct fuse_conn *fc); +bool fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); -- cgit From 8fe97d47b52ae1ad130470b1780f0ded4ba609a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 20 Nov 2022 13:43:03 +0100 Subject: btrfs: use kvcalloc in btrfs_get_dev_zone_info Otherwise the kernel memory allocator seems to be unhappy about failing order 6 allocations for the zones array, that cause 100% reproducible mount failures in my qemu setup: [26.078981] mount: page allocation failure: order:6, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null) [26.079741] CPU: 0 PID: 2965 Comm: mount Not tainted 6.1.0-rc5+ #185 [26.080181] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [26.080950] Call Trace: [26.081132] [26.081291] dump_stack_lvl+0x56/0x6f [26.081554] warn_alloc+0x117/0x140 [26.081808] ? __alloc_pages_direct_compact+0x1b5/0x300 [26.082174] __alloc_pages_slowpath.constprop.0+0xd0e/0xde0 [26.082569] __alloc_pages+0x32a/0x340 [26.082836] __kmalloc_large_node+0x4d/0xa0 [26.083133] ? 
trace_kmalloc+0x29/0xd0 [26.083399] kmalloc_large+0x14/0x60 [26.083654] btrfs_get_dev_zone_info+0x1b9/0xc00 [26.083980] ? _raw_spin_unlock_irqrestore+0x28/0x50 [26.084328] btrfs_get_dev_zone_info_all_devices+0x54/0x80 [26.084708] open_ctree+0xed4/0x1654 [26.084974] btrfs_mount_root.cold+0x12/0xde [26.085288] ? lock_is_held_type+0xe2/0x140 [26.085603] legacy_get_tree+0x28/0x50 [26.085876] vfs_get_tree+0x1d/0xb0 [26.086139] vfs_kern_mount.part.0+0x6c/0xb0 [26.086456] btrfs_mount+0x118/0x3a0 [26.086728] ? lock_is_held_type+0xe2/0x140 [26.087043] legacy_get_tree+0x28/0x50 [26.087323] vfs_get_tree+0x1d/0xb0 [26.087587] path_mount+0x2ba/0xbe0 [26.087850] ? _raw_spin_unlock_irqrestore+0x38/0x50 [26.088217] __x64_sys_mount+0xfe/0x140 [26.088506] do_syscall_64+0x35/0x80 [26.088776] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 44d8177eeffc..c9e2b0c85309 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -467,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; @@ -586,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } - kfree(zones); + kvfree(zones); switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: @@ -618,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) return 0; out: - kfree(zones); + kvfree(zones); out_free_zone_info: btrfs_destroy_dev_zone_info(device); -- cgit From 796787c978efbbdb50e245718c784eb94f59eac4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:22 +0000 Subject: btrfs: do not modify log tree while holding a leaf from fs tree locked When logging an inode in full mode, or when logging xattrs or when logging the dir index items of a directory, we are modifying the log tree while holding a read lock on a leaf from the fs/subvolume tree. This can lead to a deadlock in rare circumstances, but it is a real possibility, and it was recently reported by syzbot with the following trace from lockdep: WARNING: possible circular locking dependency detected 6.1.0-rc5-next-20221116-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.1/16154 is trying to acquire lock: ffff88807e3084a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0xa1/0xf30 fs/btrfs/delayed-inode.c:256 but task is already holding lock: ffff88807df33078 (btrfs-log-00){++++}-{3:3}, at: __btrfs_tree_lock+0x32/0x3d0 fs/btrfs/locking.c:197 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (btrfs-log-00){++++}-{3:3}: down_read_nested+0x9e/0x450 kernel/locking/rwsem.c:1634 __btrfs_tree_read_lock+0x32/0x350 fs/btrfs/locking.c:135 btrfs_tree_read_lock fs/btrfs/locking.c:141 [inline] btrfs_read_lock_root_node+0x82/0x3a0 fs/btrfs/locking.c:280 btrfs_search_slot_get_root fs/btrfs/ctree.c:1678 [inline] btrfs_search_slot+0x3ca/0x2c70 fs/btrfs/ctree.c:1998 btrfs_lookup_csum+0x116/0x3f0 fs/btrfs/file-item.c:209 btrfs_csum_file_blocks+0x40e/0x1370 fs/btrfs/file-item.c:1021 log_csums.isra.0+0x244/0x2d0 fs/btrfs/tree-log.c:4258 copy_items.isra.0+0xbfb/0xed0 fs/btrfs/tree-log.c:4403 copy_inode_items_to_log+0x13d6/0x1d90 fs/btrfs/tree-log.c:5873 btrfs_log_inode+0xb19/0x4680 fs/btrfs/tree-log.c:6495 btrfs_log_inode_parent+0x890/0x2a20 fs/btrfs/tree-log.c:6982 btrfs_log_dentry_safe+0x59/0x80 fs/btrfs/tree-log.c:7083 btrfs_sync_file+0xa41/0x13c0 fs/btrfs/file.c:1921 vfs_fsync_range+0x13e/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2856 [inline] iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128 btrfs_direct_write fs/btrfs/file.c:1536 [inline] btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668 call_write_iter include/linux/fs.h:2160 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #1 (btrfs-tree-00){++++}-{3:3}: __lock_release kernel/locking/lockdep.c:5382 [inline] lock_release+0x371/0x810 kernel/locking/lockdep.c:5688 up_write+0x2a/0x520 kernel/locking/rwsem.c:1614 btrfs_tree_unlock_rw fs/btrfs/locking.h:189 [inline] btrfs_unlock_up_safe+0x1e3/0x290 fs/btrfs/locking.c:238 search_leaf fs/btrfs/ctree.c:1832 [inline] btrfs_search_slot+0x265e/0x2c70 fs/btrfs/ctree.c:2074 btrfs_insert_empty_items+0xbd/0x1c0 fs/btrfs/ctree.c:4133 btrfs_insert_delayed_item+0x826/0xfa0 fs/btrfs/delayed-inode.c:746 btrfs_insert_delayed_items fs/btrfs/delayed-inode.c:824 [inline] __btrfs_commit_inode_delayed_items fs/btrfs/delayed-inode.c:1111 [inline] __btrfs_run_delayed_items+0x280/0x590 fs/btrfs/delayed-inode.c:1153 flush_space+0x147/0xe90 fs/btrfs/space-info.c:728 btrfs_async_reclaim_metadata_space+0x541/0xc10 fs/btrfs/space-info.c:1086 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x669/0x1090 kernel/workqueue.c:2436 kthread+0x2e8/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:3097 [inline] check_prevs_add kernel/locking/lockdep.c:3216 [inline] validate_chain kernel/locking/lockdep.c:3831 [inline] __lock_acquire+0x2a43/0x56d0 kernel/locking/lockdep.c:5055 lock_acquire kernel/locking/lockdep.c:5668 [inline] lock_acquire+0x1e3/0x630 kernel/locking/lockdep.c:5633 __mutex_lock_common kernel/locking/mutex.c:603 [inline] __mutex_lock+0x12f/0x1360 kernel/locking/mutex.c:747 __btrfs_release_delayed_node.part.0+0xa1/0xf30 
fs/btrfs/delayed-inode.c:256 __btrfs_release_delayed_node fs/btrfs/delayed-inode.c:251 [inline] btrfs_release_delayed_node fs/btrfs/delayed-inode.c:281 [inline] btrfs_remove_delayed_node+0x52/0x60 fs/btrfs/delayed-inode.c:1285 btrfs_evict_inode+0x511/0xf30 fs/btrfs/inode.c:5554 evict+0x2ed/0x6b0 fs/inode.c:664 dispose_list+0x117/0x1e0 fs/inode.c:697 prune_icache_sb+0xeb/0x150 fs/inode.c:896 super_cache_scan+0x391/0x590 fs/super.c:106 do_shrink_slab+0x464/0xce0 mm/vmscan.c:843 shrink_slab_memcg mm/vmscan.c:912 [inline] shrink_slab+0x388/0x660 mm/vmscan.c:991 shrink_node_memcgs mm/vmscan.c:6088 [inline] shrink_node+0x93d/0x1f30 mm/vmscan.c:6117 shrink_zones mm/vmscan.c:6355 [inline] do_try_to_free_pages+0x3b4/0x17a0 mm/vmscan.c:6417 try_to_free_mem_cgroup_pages+0x3a4/0xa70 mm/vmscan.c:6732 reclaim_high.constprop.0+0x182/0x230 mm/memcontrol.c:2393 mem_cgroup_handle_over_high+0x190/0x520 mm/memcontrol.c:2578 try_charge_memcg+0xe0c/0x12f0 mm/memcontrol.c:2816 try_charge mm/memcontrol.c:2827 [inline] charge_memcg+0x90/0x3b0 mm/memcontrol.c:6889 __mem_cgroup_charge+0x2b/0x90 mm/memcontrol.c:6910 mem_cgroup_charge include/linux/memcontrol.h:667 [inline] __filemap_add_folio+0x615/0xf80 mm/filemap.c:852 filemap_add_folio+0xaf/0x1e0 mm/filemap.c:934 __filemap_get_folio+0x389/0xd80 mm/filemap.c:1976 pagecache_get_page+0x2e/0x280 mm/folio-compat.c:104 find_or_create_page include/linux/pagemap.h:612 [inline] alloc_extent_buffer+0x2b9/0x1580 fs/btrfs/extent_io.c:4588 btrfs_init_new_buffer fs/btrfs/extent-tree.c:4869 [inline] btrfs_alloc_tree_block+0x2e1/0x1320 fs/btrfs/extent-tree.c:4988 __btrfs_cow_block+0x3b2/0x1420 fs/btrfs/ctree.c:440 btrfs_cow_block+0x2fa/0x950 fs/btrfs/ctree.c:595 btrfs_search_slot+0x11b0/0x2c70 fs/btrfs/ctree.c:2038 btrfs_update_root+0xdb/0x630 fs/btrfs/root-tree.c:137 update_log_root fs/btrfs/tree-log.c:2841 [inline] btrfs_sync_log+0xbfb/0x2870 fs/btrfs/tree-log.c:3064 btrfs_sync_file+0xdb9/0x13c0 fs/btrfs/file.c:1947 vfs_fsync_range+0x13e/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2856 [inline] iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128 btrfs_direct_write fs/btrfs/file.c:1536 [inline] btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668 call_write_iter include/linux/fs.h:2160 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd other info that might help us debug this: Chain exists of: &delayed_node->mutex --> btrfs-tree-00 --> btrfs-log-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-log-00); lock(btrfs-tree-00); lock(btrfs-log-00); lock(&delayed_node->mutex); Holding a read lock on a leaf from a fs/subvolume tree creates a nasty lock dependency when we are COWing extent buffers for the log tree and we have two tasks modifying the log tree, with each one in one of the following 2 scenarios: 1) Modifying the log tree triggers an extent buffer allocation while 
holding a write lock on a parent extent buffer from the log tree. Allocating the pages for an extent buffer, or the extent buffer struct, can trigger inode eviction and finally the inode eviction will trigger a release/remove of a delayed node, which requires taking the delayed node's mutex; 2) Allocating a metadata extent for a log tree can trigger the async reclaim thread and make us wait for it to release enough space and unblock our reservation ticket. The reclaim thread can start flushing delayed items, and that in turn results in the need to lock delayed node mutexes and in the need to write lock extent buffers of a subvolume tree - all this while holding a write lock on the parent extent buffer in the log tree. So one task in scenario 1) running in parallel with another task in scenario 2) could lead to a deadlock, one wanting to lock a delayed node mutex while having a read lock on a leaf from the subvolume, while the other is holding the delayed node's mutex and wants to write lock the same subvolume leaf for flushing delayed items. Fix this by cloning the leaf of the fs/subvolume tree, release/unlock the fs/subvolume leaf and use the clone leaf instead. Reported-by: syzbot+9b7c21f486f5e7f8d029@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000ccc93c05edc4d8cf@google.com/ CC: stable@vger.kernel.org # 6.0+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 813986e38258..c3cf3dabe0b1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3694,15 +3694,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; - struct extent_buffer *src = path->nodes[0]; - const int nritems = btrfs_header_nritems(src); + struct extent_buffer *src; + const int nritems = btrfs_header_nritems(path->nodes[0]); const u64 ino = btrfs_ino(inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - for (i = path->slots[0]; i < nritems; i++) { + /* + * We need to clone the leaf, release the read lock on it, and use the + * clone before modifying the log tree. See the comment at copy_items() + * about why we need to do this. + */ + src = btrfs_clone_extent_buffer(path->nodes[0]); + if (!src) + return -ENOMEM; + + i = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = src; + path->slots[0] = i; + + for (; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4303,7 +4317,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct extent_buffer *src = src_path->nodes[0]; + struct extent_buffer *src; int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -4314,6 +4328,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); + /* + * To keep lockdep happy and avoid deadlocks, clone the source leaf and + * use the clone. 
This is because otherwise we would be changing the log + * tree, to insert items from the subvolume tree or insert csum items, + * while holding a read lock on a leaf from the subvolume tree, which + * creates a nasty lock dependency when COWing log tree nodes/leaves: + * + * 1) Modifying the log tree triggers an extent buffer allocation while + * holding a write lock on a parent extent buffer from the log tree. + * Allocating the pages for an extent buffer, or the extent buffer + * struct, can trigger inode eviction and finally the inode eviction + * will trigger a release/remove of a delayed node, which requires + * taking the delayed node's mutex; + * + * 2) Allocating a metadata extent for a log tree can trigger the async + * reclaim thread and make us wait for it to release enough space and + * unblock our reservation ticket. The reclaim thread can start + * flushing delayed items, and that in turn results in the need to + * lock delayed node mutexes and in the need to write lock extent + * buffers of a subvolume tree - all this while holding a write lock + * on the parent extent buffer in the log tree. + * + * So one task in scenario 1) running in parallel with another task in + * scenario 2) could lead to a deadlock, one wanting to lock a delayed + * node mutex while having a read lock on a leaf from the subvolume, + * while the other is holding the delayed node's mutex and wants to + * write lock the same subvolume leaf for flushing delayed items. + */ + src = btrfs_clone_extent_buffer(src_path->nodes[0]); + if (!src) + return -ENOMEM; + + i = src_path->slots[0]; + btrfs_release_path(src_path); + src_path->nodes[0] = src; + src_path->slots[0] = i; + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); if (!ins_data) -- cgit From ffdbb44f2f23f963b8f5672e35c3a26088177a62 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 22 Nov 2022 19:50:02 +0800 Subject: btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs() Although kset_unregister() can eventually remove all attribute files, explicitly rolling back with the matching function makes the code logic look clearer. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Zhen Lei Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 699b54b3acaa..74fef1f49c35 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -2321,8 +2321,11 @@ int __init btrfs_init_sysfs(void) #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); - if (ret) - goto out2; + if (ret) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + goto out_remove_group; + } #endif return 0; -- cgit From ac8db824ead0de2e9111337c401409d010fba2f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 23 Nov 2022 14:14:32 -0500 Subject: NFSD: Fix reads with a non-zero offset that don't end on a page boundary This was found when virtual machines with nfs-mounted qcow2 disks failed to boot properly. 
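A worked example of the arithmetic (illustrative values, not taken from the bug report): with PAGE_SIZE = 4096, a splice chunk of sd->len = 4096 bytes starting at buf->offset = 512 covers bytes 512..4607 of the (possibly compound) page, i.e. two pages, yet stepping down from sd->len in PAGE_SIZE strides visits only one. Counting from the position of the last byte, as the hunk below does, covers the full span:

	/* sketch only; offset/len stand in for buf->offset and sd->len */
	unsigned int offset = 512, len = 4096;			/* assumed example values */
	unsigned int first = offset / PAGE_SIZE;		/* 0 */
	unsigned int last = (offset + len - 1) / PAGE_SIZE;	/* 4607 / 4096 = 1 */
	unsigned int npages = last - first + 1;	/* 2 pages; the old loop replaced only 1 */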
Reported-by: Anders Blomdell
Suggested-by: Al Viro
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2142132
Fixes: bfbfb6182ad1 ("nfsd_splice_actor(): handle compound pages")
Signed-off-by: Chuck Lever
---
 fs/nfsd/vfs.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 83be89905cbf..31bc7cc82439 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -871,10 +871,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct svc_rqst *rqstp = sd->u.data;
 	struct page *page = buf->page;	// may be a compound one
 	unsigned offset = buf->offset;
+	struct page *last_page;
 
-	page += offset / PAGE_SIZE;
-	for (int i = sd->len; i > 0; i -= PAGE_SIZE)
-		svc_rqst_replace_page(rqstp, page++);
+	last_page = page + (offset + sd->len - 1) / PAGE_SIZE;
+	for (page += offset / PAGE_SIZE; page <= last_page; page++)
+		svc_rqst_replace_page(rqstp, page);
 	if (rqstp->rq_res.page_len == 0)	// first call
 		rqstp->rq_res.page_base = offset % PAGE_SIZE;
 	rqstp->rq_res.page_len += sd->len;
-- cgit

From 61d8e42667716f71f2c26e327e66f2940d809f80 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 24 Nov 2022 22:55:57 -0500
Subject: copy_mnt_ns(): handle a corner case (overmounted mntns bindings) saner

copy_mnt_ns() has the old tree copied, with mntns bindings *and* anything
bound on top of them skipped. Then it proceeds to walk both trees in
parallel. Unfortunately, it doesn't get the "skip the stuff we'd skipped
when copying" quite right.

Consequences are minor (the ->mnt_root comparison will return the
situation to sanity pretty soon and the worst we get is the unexpected
subset of opened non-directories being switched to the new namespace),
but it's confusing enough and it's not hard to get the expected
behaviour...

Signed-off-by: Al Viro
---
 fs/namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index df137ba19d37..c80f422084eb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3515,8 +3515,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		q = next_mnt(q, new);
 		if (!q)
 			break;
+		// an mntns binding we'd skipped?
 		while (p->mnt.mnt_root != q->mnt.mnt_root)
-			p = next_mnt(p, old);
+			p = next_mnt(skip_mnt_tree(p), old);
 	}
 	namespace_unlock();
-- cgit

From bdbadfcc37c5c8f9f2a401a18eae71b0c28799ee Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sat, 3 Sep 2022 20:23:56 -0400
Subject: [elf][non-regset] uninline elf_core_copy_task_fpregs() (and lose
 pt_regs argument)

Don't bother with pointless macros - we are not sharing it with aout
coredumps anymore. Just convert the underlying functions to the same
arguments (nobody uses regs, actually) and call them
elf_core_copy_task_fpregs(). And unexport the entire bunch, while we
are at it.

[added missing includes in arch/{csky,m68k,um}/kernel/process.c to avoid
extra warnings about the lack of externs getting added to huge piles for
those files. Pointless, but...]
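The resulting signature change, shown as a sketch (the arch-side definitions live outside fs/, so the prototypes here are assumptions based on the call sites below):

	/* before: @regs was accepted and ignored by every implementation */
	int elf_core_copy_task_fpregs(struct task_struct *t, struct pt_regs *regs,
				      elf_fpregset_t *fpu);

	/* after: the pt_regs argument is gone, same name across architectures */
	int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu);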
Signed-off-by: Al Viro --- fs/binfmt_elf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e990075fb43d..c3c5bd48361e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2001,8 +2001,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) t->num_notes++; sz += notesize(&t->notes[0]); - if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, - &t->fpu))) { + if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, &t->fpu))) { fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), &(t->fpu)); t->num_notes++; @@ -2100,7 +2099,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* Try to dump the FPU. */ info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, task_pt_regs(current), info->fpu); + elf_core_copy_task_fpregs(current, info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); -- cgit From e961d370fc7b806a0d668413a9f1b006760eaee6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Sep 2022 17:17:26 -0400 Subject: [elf][non-regset] use elf_core_copy_task_regs() for dumper as well elf_core_copy_regs() is equivalent to elf_core_copy_task_regs() of current on all architectures. Signed-off-by: Al Viro --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c3c5bd48361e..cb95e842c50f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2072,7 +2072,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, task_pt_regs(current)); + elf_core_copy_task_regs(current, &info->prstatus->pr_reg); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); -- cgit From e92edb85d87e9f8c8bead9cdb7cb2da4b51fd7f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2022 00:41:05 -0400 Subject: [elf] unify regset and non-regset cases The only real difference is in filling per-thread notes - getting the values of registers. And this is the only part that is worth an ifdef - we don't need to duplicate the logics regarding gathering threads, filling other notes, etc. It would've been hard to do back when regset-based variant had been introduced, mostly due to sharing bits and pieces of helpers with aout coredumps. As the result, too much had been duplicated and the copies had drifted away since then. Now it can be done cleanly... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 230 ++++++++++---------------------------------------------- 1 file changed, 38 insertions(+), 192 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cb95e842c50f..3758a617c9d8 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1723,7 +1723,6 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm return 0; } -#ifdef CORE_DUMP_USE_REGSET #include struct elf_thread_core_info { @@ -1744,6 +1743,7 @@ struct elf_note_info { int thread_notes; }; +#ifdef CORE_DUMP_USE_REGSET /* * When a regset has a writeback hook, we call it on each thread before * dumping user memory. 
On register window machines, this makes sure the @@ -1823,13 +1823,41 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, return 1; } +#else +static int fill_thread_core_info(struct elf_thread_core_info *t, + const struct user_regset_view *view, + long signr, struct elf_note_info *info) +{ + struct task_struct *p = t->task; + elf_fpregset_t *fpu; + + fill_prstatus(&t->prstatus.common, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &(t->prstatus)); + info->size += notesize(&t->notes[0]); + + fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL); + if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) { + kfree(fpu); + return 1; + } + + t->prstatus.pr_fpvalid = 1; + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + info->size += notesize(&t->notes[1]); + + return 1; +} +#endif static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, struct coredump_params *cprm) { struct task_struct *dump_task = current; - const struct user_regset_view *view = task_user_regset_view(dump_task); + const struct user_regset_view *view; struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; @@ -1839,6 +1867,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); +#ifdef CORE_DUMP_USE_REGSET + view = task_user_regset_view(dump_task); + /* * Figure out how many notes we're going to need for each thread. */ @@ -1862,6 +1893,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, */ fill_elf_header(elf, phdrs, view->e_machine, view->e_flags); +#else + view = NULL; + info->thread_notes = 2; + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); +#endif /* * Allocate a structure for each thread. @@ -1969,196 +2005,6 @@ static void free_note_info(struct elf_note_info *info) kvfree(info->files.data); } -#else - -/* Here is the structure in which status of each thread is captured. */ -struct elf_thread_status -{ - struct list_head list; - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - elf_fpregset_t fpu; /* NT_PRFPREG */ - struct task_struct *thread; - struct memelfnote notes[3]; - int num_notes; -}; - -/* - * In order to add the specific thread information for the elf file format, - * we need to keep a linked list of every threads pr_status and then create - * a single section for them in the final core file. 
- */ -static int elf_dump_thread_status(long signr, struct elf_thread_status *t) -{ - int sz = 0; - struct task_struct *p = t->thread; - t->num_notes = 0; - - fill_prstatus(&t->prstatus.common, p, signr); - elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), - &(t->prstatus)); - t->num_notes++; - sz += notesize(&t->notes[0]); - - if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, &t->fpu))) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), - &(t->fpu)); - t->num_notes++; - sz += notesize(&t->notes[1]); - } - return sz; -} - -struct elf_note_info { - struct memelfnote *notes; - struct memelfnote *notes_files; - struct elf_prstatus *prstatus; /* NT_PRSTATUS */ - struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ - struct list_head thread_list; - elf_fpregset_t *fpu; - user_siginfo_t csigdata; - int thread_status_size; - int numnote; -}; - -static int elf_note_info_init(struct elf_note_info *info) -{ - memset(info, 0, sizeof(*info)); - INIT_LIST_HEAD(&info->thread_list); - - /* Allocate space for ELF notes */ - info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL); - if (!info->notes) - return 0; - info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); - if (!info->psinfo) - return 0; - info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); - if (!info->prstatus) - return 0; - info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); - if (!info->fpu) - return 0; - return 1; -} - -static int fill_note_info(struct elfhdr *elf, int phdrs, - struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct core_thread *ct; - struct elf_thread_status *ets; - - if (!elf_note_info_init(info)) - return 0; - - for (ct = current->signal->core_state->dumper.next; - ct; ct = ct->next) { - ets = kzalloc(sizeof(*ets), GFP_KERNEL); - if (!ets) - return 0; - - ets->thread = ct->task; - list_add(&ets->list, &info->thread_list); - } - - list_for_each_entry(ets, &info->thread_list, list) { - int sz; - - sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); - info->thread_status_size += sz; - } - /* now collect the dump for the current */ - memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_task_regs(current, &info->prstatus->pr_reg); - - /* Set up header */ - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); - - /* - * Set up the notes in similar form to SVR4 core dumps made - * with info from their /proc. - */ - - fill_note(info->notes + 0, "CORE", NT_PRSTATUS, - sizeof(*info->prstatus), info->prstatus); - fill_psinfo(info->psinfo, current->group_leader, current->mm); - fill_note(info->notes + 1, "CORE", NT_PRPSINFO, - sizeof(*info->psinfo), info->psinfo); - - fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); - fill_auxv_note(info->notes + 3, current->mm); - info->numnote = 4; - - if (fill_files_note(info->notes + info->numnote, cprm) == 0) { - info->notes_files = info->notes + info->numnote; - info->numnote++; - } - - /* Try to dump the FPU. 
*/ - info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, info->fpu); - if (info->prstatus->pr_fpvalid) - fill_note(info->notes + info->numnote++, - "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); - return 1; -} - -static size_t get_note_info_size(struct elf_note_info *info) -{ - int sz = 0; - int i; - - for (i = 0; i < info->numnote; i++) - sz += notesize(info->notes + i); - - sz += info->thread_status_size; - - return sz; -} - -static int write_note_info(struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct elf_thread_status *ets; - int i; - - for (i = 0; i < info->numnote; i++) - if (!writenote(info->notes + i, cprm)) - return 0; - - /* write out the thread status notes section */ - list_for_each_entry(ets, &info->thread_list, list) { - for (i = 0; i < ets->num_notes; i++) - if (!writenote(&ets->notes[i], cprm)) - return 0; - } - - return 1; -} - -static void free_note_info(struct elf_note_info *info) -{ - while (!list_empty(&info->thread_list)) { - struct list_head *tmp = info->thread_list.next; - list_del(tmp); - kfree(list_entry(tmp, struct elf_thread_status, list)); - } - - /* Free data possibly allocated by fill_files_note(): */ - if (info->notes_files) - kvfree(info->notes_files->data); - - kfree(info->prstatus); - kfree(info->psinfo); - kfree(info->notes); - kfree(info->fpu); -} - -#endif - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { -- cgit From 38ba2f11d9ce0e7c9444e57cb1bb418d1979534b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Sep 2022 23:24:11 -0400 Subject: [elf] get rid of get_note_info_size() it's trivial now... Signed-off-by: Al Viro --- fs/binfmt_elf.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3758a617c9d8..65d2d2ba4d9e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1946,11 +1946,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 1; } -static size_t get_note_info_size(struct elf_note_info *info) -{ - return info->size; -} - /* * Write all the notes for each thread. When writing the first thread, the * process-wide notes are interleaved after the first thread-specific note. @@ -2068,7 +2063,7 @@ static int elf_core_dump(struct coredump_params *cprm) /* Write notes phdr entry */ { - size_t sz = get_note_info_size(&info); + size_t sz = info.size; /* For cell spufs */ sz += elf_coredump_extra_notes_size(); -- cgit From cda2ed05aade303b7d89844a0333168c3484634a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Oct 2022 13:46:26 +0100 Subject: fs: simplify vfs_get_super Remove the pointless keying argument and associated enum and pass the fill_super callback and a "bool reconf" instead. Also mark the function static given that there are no users outside of super.c. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/super.c | 60 +++++++++--------------------------------------------------- 1 file changed, 9 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 6a82660e1adb..76f477c24c3d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1111,55 +1111,14 @@ static int test_single_super(struct super_block *s, struct fs_context *fc) return 1; } -/** - * vfs_get_super - Get a superblock with a search key set in s_fs_info. 
- * @fc: The filesystem context holding the parameters
- * @keying: How to distinguish superblocks
- * @fill_super: Helper to initialise a new superblock
- *
- * Search for a superblock and create a new one if not found. The search
- * criterion is controlled by @keying. If the search fails, a new superblock
- * is created and @fill_super() is called to initialise it.
- *
- * @keying can take one of a number of values:
- *
- * (1) vfs_get_single_super - Only one superblock of this type may exist on the
- *     system. This is typically used for special system filesystems.
- *
- * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
- *     distinct keys (where the key is in s_fs_info). Searching for the same
- *     key again will turn up the superblock for that key.
- *
- * (3) vfs_get_independent_super - Multiple superblocks may exist and are
- *     unkeyed. Each call will get a new superblock.
- *
- * A permissions check is made by sget_fc() unless we're getting a superblock
- * for a kernel-internal mount or a submount.
- */
-int vfs_get_super(struct fs_context *fc,
-		  enum vfs_get_super_keying keying,
-		  int (*fill_super)(struct super_block *sb,
-				    struct fs_context *fc))
+static int vfs_get_super(struct fs_context *fc, bool reconf,
+		int (*test)(struct super_block *, struct fs_context *),
+		int (*fill_super)(struct super_block *sb,
+				  struct fs_context *fc))
 {
-	int (*test)(struct super_block *, struct fs_context *);
 	struct super_block *sb;
 	int err;
 
-	switch (keying) {
-	case vfs_get_single_super:
-	case vfs_get_single_reconf_super:
-		test = test_single_super;
-		break;
-	case vfs_get_keyed_super:
-		test = test_keyed_super;
-		break;
-	case vfs_get_independent_super:
-		test = NULL;
-		break;
-	default:
-		BUG();
-	}
-
 	sb = sget_fc(fc, test, set_anon_super_fc);
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
@@ -1173,7 +1132,7 @@ int vfs_get_super(struct fs_context *fc,
 		fc->root = dget(sb->s_root);
 	} else {
 		fc->root = dget(sb->s_root);
-		if (keying == vfs_get_single_reconf_super) {
+		if (reconf) {
 			err = reconfigure_super(fc);
 			if (err < 0) {
 				dput(fc->root);
@@ -1189,13 +1148,12 @@ error:
 	deactivate_locked_super(sb);
 	return err;
 }
-EXPORT_SYMBOL(vfs_get_super);
 
 int get_tree_nodev(struct fs_context *fc,
 		  int (*fill_super)(struct super_block *sb,
 				    struct fs_context *fc))
 {
-	return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+	return vfs_get_super(fc, false, NULL, fill_super);
 }
 EXPORT_SYMBOL(get_tree_nodev);
 
@@ -1203,7 +1161,7 @@ int get_tree_single(struct fs_context *fc,
 		  int (*fill_super)(struct super_block *sb,
 				    struct fs_context *fc))
 {
-	return vfs_get_super(fc, vfs_get_single_super, fill_super);
+	return vfs_get_super(fc, false, test_single_super, fill_super);
 }
 EXPORT_SYMBOL(get_tree_single);
 
@@ -1211,7 +1169,7 @@ int get_tree_single_reconf(struct fs_context *fc,
 		  int (*fill_super)(struct super_block *sb,
 				    struct fs_context *fc))
 {
-	return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
+	return vfs_get_super(fc, true, test_single_super, fill_super);
 }
 EXPORT_SYMBOL(get_tree_single_reconf);
 
@@ -1221,7 +1179,7 @@ int get_tree_keyed(struct fs_context *fc,
 		void *key)
 {
 	fc->s_fs_info = key;
-	return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
+	return vfs_get_super(fc, false, test_keyed_super, fill_super);
 }
 EXPORT_SYMBOL(get_tree_keyed);
-- cgit
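For reference, the four get_tree_*() helpers now spell out what the removed keying enum used to select; condensed from the hunks above (a mapping summary, not new code):

	get_tree_nodev()         -> vfs_get_super(fc, false, NULL,              fill_super)	/* was vfs_get_independent_super */
	get_tree_single()        -> vfs_get_super(fc, false, test_single_super, fill_super)	/* was vfs_get_single_super */
	get_tree_single_reconf() -> vfs_get_super(fc, true,  test_single_super, fill_super)	/* was vfs_get_single_reconf_super */
	get_tree_keyed()         -> vfs_get_super(fc, false, test_keyed_super,  fill_super)	/* was vfs_get_keyed_super */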
("vfs: fix copy_file_range() regression in cross-fs copies") removed fallback to generic_copy_file_range() for cross-fs cases inside vfs_copy_file_range(). To preserve behavior of nfsd and ksmbd server-side-copy, the fallback to generic_copy_file_range() was added in nfsd and ksmbd code, but that call is missing sb_start_write(), fsnotify hooks and more. Ideally, nfsd and ksmbd would pass a flag to vfs_copy_file_range() that will take care of the fallback, but that code would be subtle and we got vfs_copy_file_range() logic wrong too many times already. Instead, add a flag to explicitly request vfs_copy_file_range() to perform only generic_copy_file_range() and let nfsd and ksmbd use this flag only in the fallback path. This choise keeps the logic changes to minimum in the non-nfsd/ksmbd code paths to reduce the risk of further regressions. Fixes: 868f9f2f8e00 ("vfs: fix copy_file_range() regression in cross-fs copies") Tested-by: Namjae Jeon Tested-by: Luis Henriques Signed-off-by: Amir Goldstein Signed-off-by: Al Viro --- fs/ksmbd/vfs.c | 6 +++--- fs/nfsd/vfs.c | 4 ++-- fs/read_write.c | 19 +++++++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 8de970d6146f..94b8ed4ef870 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1794,9 +1794,9 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_fp->filp, src_off, - dst_fp->filp, dst_off, - len, 0); + ret = vfs_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, len, + COPY_FILE_SPLICE); if (ret < 0) return ret; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f650afedd67f..5cf11cde51f8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -596,8 +596,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src, src_pos, dst, dst_pos, - count, 0); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, + COPY_FILE_SPLICE); return ret; } diff --git a/fs/read_write.c b/fs/read_write.c index 328ce8cf9a85..24b9668d6377 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1388,6 +1388,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { + lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); + return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); } @@ -1424,7 +1426,9 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. 
From db58653ce0c7cf4d155727852607106f890005c0 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Mon, 21 Nov 2022 16:29:37 +0900
Subject: zonefs: Fix active zone accounting

If a file zone transitions to the offline or readonly state from an
active state, we must clear the zone active flag and decrement the
active seq file counter. Do so in zonefs_account_active() using the new
zonefs inode flags ZONEFS_ZONE_OFFLINE and ZONEFS_ZONE_READONLY. These
flags are set if necessary in zonefs_check_zone_condition() based on the
result of the report zones operation after an IO error.

Fixes: 87c9ce3ffec9 ("zonefs: Add active seq file accounting")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal
Reviewed-by: Johannes Thumshirn
---
 fs/zonefs/super.c  | 11 +++++++++++
 fs/zonefs/zonefs.h |  6 ++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index f0e8a000f073..2c53fbb8d918 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -40,6 +40,13 @@ static void zonefs_account_active(struct inode *inode)
 	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
 		return;
 
+	/*
+	 * For zones that transitioned to the offline or readonly condition,
+	 * we only need to clear the active state.
+	 */
+	if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
+		goto out;
+
 	/*
 	 * If the zone is active, that is, if it is explicitly open or
 	 * partially written, check if it was already accounted as active.
@@ -53,6 +60,7 @@ static void zonefs_account_active(struct inode *inode)
 		return;
 	}
 
+out:
 	/* The zone is not active.
If it was, update the active count */ if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; @@ -324,6 +332,7 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, inode->i_flags |= S_IMMUTABLE; inode->i_mode &= ~0777; zone->wp = zone->start; + zi->i_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* @@ -342,8 +351,10 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, zone->cond = BLK_ZONE_COND_OFFLINE; inode->i_mode &= ~0777; zone->wp = zone->start; + zi->i_flags |= ZONEFS_ZONE_OFFLINE; return 0; } + zi->i_flags |= ZONEFS_ZONE_READONLY; inode->i_mode &= ~0222; return i_size_read(inode); case BLK_ZONE_COND_FULL: diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 4b3de66c3233..1dbe78119ff1 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -39,8 +39,10 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } -#define ZONEFS_ZONE_OPEN (1 << 0) -#define ZONEFS_ZONE_ACTIVE (1 << 1) +#define ZONEFS_ZONE_OPEN (1U << 0) +#define ZONEFS_ZONE_ACTIVE (1U << 1) +#define ZONEFS_ZONE_OFFLINE (1U << 2) +#define ZONEFS_ZONE_READONLY (1U << 3) /* * In-memory inode data. -- cgit From de4eda9de2d957ef2d6a8365a01e26a435e958cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Sep 2022 20:25:47 -0400 Subject: use less confusing names for iov_iter direction initializers READ/WRITE proved to be actively confusing - the meanings are "data destination, as used with read(2)" and "data source, as used with write(2)", but people keep interpreting those as "we read data from it" and "we write data to it", i.e. exactly the wrong way. Call them ITER_DEST and ITER_SOURCE - at least that is harder to misinterpret... Signed-off-by: Al Viro --- fs/9p/vfs_addr.c | 4 ++-- fs/9p/vfs_dir.c | 2 +- fs/9p/xattr.c | 4 ++-- fs/afs/cmservice.c | 2 +- fs/afs/dir.c | 2 +- fs/afs/file.c | 4 ++-- fs/afs/internal.h | 4 ++-- fs/afs/rxrpc.c | 10 +++++----- fs/afs/write.c | 4 ++-- fs/aio.c | 4 ++-- fs/btrfs/ioctl.c | 4 ++-- fs/ceph/addr.c | 4 ++-- fs/ceph/file.c | 4 ++-- fs/cifs/connect.c | 6 +++--- fs/cifs/file.c | 4 ++-- fs/cifs/fscache.c | 4 ++-- fs/cifs/smb2ops.c | 4 ++-- fs/cifs/transport.c | 6 +++--- fs/coredump.c | 2 +- fs/erofs/fscache.c | 6 +++--- fs/fscache/io.c | 2 +- fs/fuse/ioctl.c | 4 ++-- fs/netfs/io.c | 6 +++--- fs/nfs/fscache.c | 4 ++-- fs/nfsd/vfs.c | 4 ++-- fs/ocfs2/cluster/tcp.c | 2 +- fs/orangefs/inode.c | 8 ++++---- fs/proc/vmcore.c | 6 +++--- fs/read_write.c | 12 ++++++------ fs/seq_file.c | 2 +- fs/splice.c | 10 +++++----- 31 files changed, 72 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 47b9a1122f34..a19891015f19 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -40,7 +40,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len); + iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); total = p9_client_read(fid, pos, &to, &err); @@ -172,7 +172,7 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) len = min_t(loff_t, i_size - start, len); - iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len); + iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 000fbaae9b18..3bb95adc9619 100644 --- 
a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -109,7 +109,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) struct iov_iter to; int n; - iov_iter_kvec(&to, READ, &kvec, 1, buflen); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 1f9298a4bd42..2807bb63f780 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -24,7 +24,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, struct iov_iter to; int err; - iov_iter_kvec(&to, READ, &kvec, 1, buffer_size); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -109,7 +109,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, struct iov_iter from; int retval, err; - iov_iter_kvec(&from, WRITE, &kvec, 1, value_len); + iov_iter_kvec(&from, ITER_SOURCE, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 0a090d614e76..7dcd59693a0c 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -298,7 +298,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (call->count2 != call->count && call->count2 != 0) return afs_protocol_error(call, afs_eproto_cb_count); call->iter = &call->def_iter; - iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); + iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4); call->unmarshall++; fallthrough; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 230c2d19116d..104df2964225 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -305,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; diff --git a/fs/afs/file.c b/fs/afs/file.c index d1cfb235c4b9..2eeab57df133 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -324,7 +324,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); @@ -346,7 +346,7 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) fsreq->len = folio_size(folio); fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages, fsreq->pos, fsreq->len); ret = afs_fetch_data(fsreq->vnode, fsreq); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 723d162078a3..9ba7b68375c9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1301,7 +1301,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; - iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); + iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) @@ -1319,7 +1319,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call) static inline 
void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; - iov_iter_discard(&call->def_iter, READ, size); + iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index eccc3cd0cb70..c62939e5ea1f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -359,7 +359,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); @@ -400,7 +400,7 @@ error_do_abort: RX_USER_ABORT, ret, "KSD"); } else { len = 0; - iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); @@ -485,7 +485,7 @@ static void afs_deliver_to_call(struct afs_call *call) ) { if (state == AFS_CALL_SV_AWAIT_ACK) { len = 0; - iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); + iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, &len, false, &remote_abort, @@ -822,7 +822,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -862,7 +862,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/fs/afs/write.c b/fs/afs/write.c index 9ebdd36eaf2f..08fd456dde67 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -609,7 +609,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, */ afs_write_to_cache(vnode, start, len, i_size, caching); - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { _debug("write discard %x @%llx [%llx]", len, start, i_size); @@ -1000,7 +1000,7 @@ int afs_launder_folio(struct folio *folio) bv[0].bv_page = &folio->page; bv[0].bv_offset = f; bv[0].bv_len = t - f; - iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); + iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len); trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); diff --git a/fs/aio.c b/fs/aio.c index 5b2ff20ad322..562916d85cba 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1552,7 +1552,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, if (unlikely(!file->f_op->read_iter)) return -EINVAL; - ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); + ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); @@ -1580,7 +1580,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, if 
(unlikely(!file->f_op->write_iter)) return -EINVAL; - ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); + ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d5dd8bed1488..a59c884c2cb0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5283,7 +5283,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } - ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; @@ -5382,7 +5382,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (args.len > args.unencoded_len - args.unencoded_offset) goto out_acct; - ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dcf701b05cc1..61f47debec5a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -288,7 +288,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); if (err == 0) err = -EFAULT; @@ -327,7 +327,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); if (err < 0) { dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 04fd34557de8..6f9580defb2b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1161,7 +1161,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, + iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs, osd_data->num_bvecs, len); iov_iter_advance(&i, rc); iov_iter_zero(zlen, &i); @@ -1400,7 +1400,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int zlen = min_t(size_t, len - ret, size - pos - ret); - iov_iter_bvec(&i, READ, bvecs, num_pages, len); + iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len); iov_iter_advance(&i, ret); iov_iter_zero(zlen, &i); ret += zlen; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9db9527c61cf..e80252a83225 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -759,7 +759,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; - iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); + iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -774,7 +774,7 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) * and cifs_readv_from_socket sets msg_control and msg_controllen * so little to initialize in 
struct msghdr */ - iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + iov_iter_discard(&smb_msg.msg_iter, ITER_DEST, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -786,7 +786,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, struct msghdr smb_msg = {}; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; - iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); + iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd9698209930..209dfc06fd6d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3532,7 +3532,7 @@ static ssize_t __cifs_writev( ctx->iter = *from; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, from, WRITE); + rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; @@ -4276,7 +4276,7 @@ static ssize_t __cifs_readv( ctx->iter = *to; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, to, READ); + rc = setup_aio_ctx_iter(ctx, to, ITER_DEST); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a1751b956318..f6f3a6b75601 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -150,7 +150,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -180,7 +180,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bfaafd02fb1f..32b3877b538a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4723,13 +4723,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - iov_iter_bvec(&iter, WRITE, bvec, npages, data_len); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); iov.iov_base = buf + data_offset; iov.iov_len = data_len; - iov_iter_kvec(&iter, WRITE, &iov, 1, data_len); + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len); } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 575fa8f58342..3851d0aaa288 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -347,7 +347,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, .iov_base = &rfc1002_marker, .iov_len = 4 }; - iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); + iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto unmask; @@ -368,7 +368,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size += iov[i].iov_len; } - iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size); + 
iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -384,7 +384,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rqst_page_get_length(&rqst[j], i, &bvec.bv_len, &bvec.bv_offset); - iov_iter_bvec(&smb_msg.msg_iter, WRITE, + iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE, &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..095ed821c8ac 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -853,7 +853,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) if (dump_interrupted()) return 0; pos = file->f_pos; - iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) return 0; diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index af5ed6b9c54d..4c837be3b6e3 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -194,7 +194,7 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, atomic_inc(&rreq->nr_outstanding); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, start + done, subreq->len); ret = fscache_read(cres, subreq->start, &iter, @@ -290,7 +290,7 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (IS_ERR(src)) return PTR_ERR(src); - iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE); + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); if (copy_to_iter(src + offset, size, &iter) != size) { erofs_put_metabuf(&buf); return -EFAULT; @@ -302,7 +302,7 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (!(map.m_flags & EROFS_MAP_MAPPED)) { count = len; - iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); return count; } diff --git a/fs/fscache/io.c b/fs/fscache/io.c index 3af3b08a9bb3..0d2b8dec8f82 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -286,7 +286,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, * taken into account. 
*/ - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); fscache_write(cres, start, &iter, fscache_wreq_done, wreq); return; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 61d8afcb10a3..fcce94ace2c2 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -255,7 +255,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.in_pages = true; err = -EFAULT; - iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) @@ -324,7 +324,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; err = -EFAULT; - iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e374767d1b68..7f753380e047 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -23,7 +23,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); @@ -49,7 +49,7 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); @@ -208,7 +208,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) continue; } - iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, subreq->start, subreq->len); atomic_inc(&rreq->nr_copy_ops); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e861d7bae305..e731c00a9fcb 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -252,7 +252,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -282,7 +282,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f650afedd67f..51f453baa952 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -942,7 +942,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ, vec, vlen, *count); + iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count); host_err = 
vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1032,7 +1032,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && !use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); + iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f660c0dbdb63..785cabd71d67 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -900,7 +900,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len); return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698d..b3bbb5a5787a 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -53,7 +53,7 @@ static int orangefs_writepage_locked(struct page *page, bv.bv_len = wlen; bv.bv_offset = off % PAGE_SIZE; WARN_ON(wlen == 0); - iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); + iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, len, wr, NULL, NULL); @@ -112,7 +112,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, else ow->bv[i].bv_offset = 0; } - iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len); + iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); WARN_ON(ow->off >= len); if (ow->off + ow->len > len) @@ -270,7 +270,7 @@ static void orangefs_readahead(struct readahead_control *rac) offset = readahead_pos(rac); i_pages = &rac->mapping->i_pages; - iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac)); + iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac)); /* read in the pages. 
*/ if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, @@ -303,7 +303,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) bv.bv_page = &folio->page; bv.bv_len = folio_size(folio); bv.bv_offset = 0; - iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio)); + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio)); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, folio_size(folio), inode->i_size, NULL, NULL, file); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..5aa527ca6dbe 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -199,7 +199,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, false); } @@ -212,7 +212,7 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); @@ -437,7 +437,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) offset = (loff_t) index << PAGE_SHIFT; kvec.iov_base = page_address(page); kvec.iov_len = PAGE_SIZE; - iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE); rc = __read_vmcore(&iter, &offset); if (rc < 0) { diff --git a/fs/read_write.c b/fs/read_write.c index 328ce8cf9a85..37c2f28b51e8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -384,7 +384,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_ubuf(&iter, READ, buf, len); + iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -424,7 +424,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; - iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); + iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) @@ -486,7 +486,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? 
*ppos : 0); - iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len); + iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -533,7 +533,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* @@ -911,7 +911,7 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { ret = do_iter_read(file, &iter, pos, flags); kfree(iov); @@ -928,7 +928,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { file_start_write(file); ret = do_iter_write(file, &iter, pos, flags); diff --git a/fs/seq_file.c b/fs/seq_file.c index 9456a2032224..f5fdaf3b1572 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -156,7 +156,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) ssize_t ret; init_sync_kiocb(&kiocb, file); - iov_iter_init(&iter, READ, &iov, 1, size); + iov_iter_init(&iter, ITER_DEST, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); diff --git a/fs/splice.c b/fs/splice.c index 0878b852b355..5969b7a1d353 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -303,7 +303,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; int ret; - iov_iter_pipe(&to, READ, pipe, len); + iov_iter_pipe(&to, ITER_DEST, pipe, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); @@ -682,7 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, n++; } - iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); + iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; @@ -1263,9 +1263,9 @@ static int vmsplice_type(struct fd f, int *type) if (!f.file) return -EBADF; if (f.file->f_mode & FMODE_WRITE) { - *type = WRITE; + *type = ITER_SOURCE; } else if (f.file->f_mode & FMODE_READ) { - *type = READ; + *type = ITER_DEST; } else { fdput(f); return -EBADF; @@ -1314,7 +1314,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, if (!iov_iter_count(&iter)) error = 0; - else if (iov_iter_rw(&iter) == WRITE) + else if (type == ITER_SOURCE) error = vmsplice_to_pipe(f.file, &iter, flags); else error = vmsplice_to_user(f.file, &iter, flags); -- cgit From aa997990080877c3a6dab9f25609073816378b43 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Nov 2022 11:20:47 -0800 Subject: fscrypt: add comment for fscrypt_valid_enc_modes_v1() Make it clear that nothing new should be added to this function. 
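For context, fscrypt_valid_enc_modes_v2() is where new mode combinations are expected to land; it layers on top of the v1 helper, roughly like this (a sketch of code outside this diff, shown only for orientation):

	static bool fscrypt_valid_enc_modes_v2(u32 contents_mode,
					       u32 filenames_mode)
	{
		/* New combinations are added here... */
		if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
		    filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
			return true;
		/* ...while every v1 combination stays valid for v2 policies. */
		return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
	}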
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221125192047.18916-1-ebiggers@kernel.org --- fs/crypto/policy.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 46757c3052ef..84fa51604b15 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } +/* + * Return %true if the given combination of encryption modes is supported for v1 + * (and later) encryption policies. + * + * Do *not* add anything new here, since v1 encryption policies are deprecated. + * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. + */ static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && -- cgit From cf260db405b1a159e9076220b40f5f02ac480525 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 25 Nov 2022 17:13:57 +0800 Subject: btrfs: replace INT_LIMIT(loff_t) with OFFSET_MAX OFFSET_MAX is self-explanatory and more readable. Signed-off-by: Zhen Lei Acked-by: David Sterba Reviewed-by: Eric Biggers Signed-off-by: Al Viro --- fs/btrfs/ordered-data.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e54f8280031f..100d9f4836b1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -761,11 +761,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *ordered; if (start + len < start) { - orig_end = INT_LIMIT(loff_t); + orig_end = OFFSET_MAX; } else { orig_end = start + len - 1; - if (orig_end > INT_LIMIT(loff_t)) - orig_end = INT_LIMIT(loff_t); + if (orig_end > OFFSET_MAX) + orig_end = OFFSET_MAX; } /* start IO across the range first to instantiate any delalloc -- cgit From 0eb43812c0270ee3d005ff32f91f7d0a6c4943af Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Aug 2022 19:44:44 -0400 Subject: NFS: Clear the file access cache upon login POSIX typically only refreshes the user's supplementary group information upon login. Since NFS servers may often refresh their concept of the user's supplementary group membership at their own cadence, it is possible for the NFS client's access cache to become stale due to the user's group membership changing on the server after the user has already logged in on the client. While it is reasonable to expect that such group membership changes are rare, and that we do not want to optimise the cache to accommodate them, it is also not unreasonable for the user to expect that if they log out and log back in again, the staleness will clear up.
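The patch below approximates "login time" as the start time of the oldest ancestor process that still runs with the user's credentials, and treats any access cache entry stamped before that time as expired. The expiry test relies on the standard wraparound-safe comparison for 64-bit monotonic nanosecond timestamps; a minimal sketch of the idiom (the helper name here is invented for illustration):

	/* true iff timestamp @a is later than @b, even across wraparound */
	static inline bool ns_after(u64 a, u64 b)
	{
		return (s64)(a - b) > 0;
	}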
Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f594dac436a7..bccb04af38ae 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2948,9 +2948,28 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co return NULL; } +static u64 nfs_access_login_time(const struct task_struct *task, + const struct cred *cred) +{ + const struct task_struct *parent; + u64 ret; + + rcu_read_lock(); + for (;;) { + parent = rcu_dereference(task->real_parent); + if (parent == task || cred_fscmp(parent->cred, cred) != 0) + break; + task = parent; + } + ret = task->start_time; + rcu_read_unlock(); + return ret; +} + static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); + u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; bool retry = true; int err; @@ -2978,6 +2997,9 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred * spin_lock(&inode->i_lock); retry = false; } + err = -ENOENT; + if ((s64)(login_time - cache->timestamp) > 0) + goto out; *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; @@ -3057,6 +3079,7 @@ static void nfs_access_add_rbtree(struct inode *inode, else goto found; } + set->timestamp = ktime_get_ns(); rb_link_node(&set->rb_node, parent, p); rb_insert_color(&set->rb_node, root_node); list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); -- cgit From eef7314caf2d73a94b68ba293cd105154d3a664e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2022 16:44:47 -0400 Subject: NFSv4.2: Clear FATTR4_WORD2_SECURITY_LABEL when done decoding We need to clear the FATTR4_WORD2_SECURITY_LABEL bitmap flag irrespective of whether or not the label is too long. Fixes: aa9c2669626c ("NFS: Client implementation of Labeled-NFS") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index acfe5f4bda48..8c5298e37f0f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4234,6 +4234,7 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EIO; + bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; if (len < NFS4_MAXLABELLEN) { if (label) { if (label->len) { @@ -4246,7 +4247,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, label->lfs = lfs; status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; } - bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); -- cgit From c8a62f440229ae7a10874776344dfcc17d860336 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2022 17:41:30 -0400 Subject: NFSv4.2: Always decode the security label If the server returns a reply that includes a security label, then we must decode it whether or not we can store the results. 
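The general rule at work here: an XDR decoder must consume an attribute's bytes from the stream even when it has nowhere to store the result, otherwise every attribute that follows is parsed at the wrong offset. A schematic of the pattern (illustrative only, not taken from this patch):

	p = xdr_inline_decode(xdr, len);	/* always advance the stream */
	if (unlikely(!p))
		return -EIO;
	if (label)				/* storing the result is optional, */
		memcpy(label->label, p, len);	/* consuming the bytes is not */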
Fixes: 1e2f67da8931 ("NFS: Remove the nfs4_label argument from decode_getattr_*() functions") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8c5298e37f0f..9103e022376a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4755,12 +4755,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, if (status < 0) goto xdr_error; - if (fattr->label) { - status = decode_attr_security_label(xdr, bitmap, fattr->label); - if (status < 0) - goto xdr_error; - fattr->valid |= status; - } + status = decode_attr_security_label(xdr, bitmap, fattr->label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); -- cgit From 43c1031f7110967c240cb6e922adcfc4b8899183 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2022 18:21:14 -0400 Subject: NFSv4.2: Fix a memory stomp in decode_attr_security_label We must not change the value of label->len if it is zero, since that indicates we stored a label. Fixes: b4487b935452 ("nfs: Fix getxattr kernel panic and memory overflow") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9103e022376a..deec76cf5afe 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4236,12 +4236,10 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, return -EIO; bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; if (len < NFS4_MAXLABELLEN) { - if (label) { - if (label->len) { - if (label->len < len) - return -ERANGE; - memcpy(label->label, p, len); - } + if (label && label->len) { + if (label->len < len) + return -ERANGE; + memcpy(label->label, p, len); label->len = len; label->pi = pi; label->lfs = lfs; -- cgit From c528f70f504434eaff993a5ddd52203a2010d51f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 19 Oct 2022 13:12:11 -0400 Subject: NFSv4.2: Fix initialisation of struct nfs4_label The call to nfs4_label_init_security() should return a fully initialised label. 
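Without this, the helper can hand back a pointer to an on-stack label whose fields were never written, and the rest of the open/create path then encodes stack garbage. Schematically (illustrative only):

	struct nfs4_label l;	/* four uninitialised fields */

	label = nfs4_label_init_security(dir, dentry, sattr, &l);
	/* if the helper returned &l without filling every field, the
	 * caller would go on to encode whatever was on the stack */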
Fixes: aa9c2669626c ("NFS: Client implementation of Labeled-NFS") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 86ed5c0142c3..98a867092039 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -122,6 +122,11 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) return NULL; + label->lfs = 0; + label->pi = 0; + label->len = 0; + label->label = NULL; + err = security_dentry_init_security(dentry, sattr->ia_mode, &dentry->d_name, NULL, (void **)&label->label, &label->len); @@ -3796,7 +3801,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr, int *opened) { struct nfs4_state *state; - struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + struct nfs4_label l, *label; label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); @@ -4682,7 +4687,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_label l, *ilabel = NULL; + struct nfs4_label l, *ilabel; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; @@ -5033,7 +5038,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5074,7 +5079,7 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5193,7 +5198,7 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct nfs4_exception exception = { .interruptible = true, }; - struct nfs4_label l, *label = NULL; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); -- cgit From 85aa8ddc3818718208c3cfdfda9c8c908c9dead1 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 20 Sep 2022 13:00:21 -0400 Subject: NFS: Trigger the "ls -l" readdir heuristic sooner Since commit 1a34c8c9a49e ("NFS: Support larger readdir buffers") updated dtsize, and with recent improvements to the READDIRPLUS helper heuristic, the heuristic may not trigger until many dentries are emitted to userspace. This will cause many thousands of GETATTR calls for "ls -l" when the directory's pagecache has already been populated. This manifests as poor performance for long directory listings after an initially fast "ls -l". Fix this by emitting only 17 entries for any first pass through the NFS directory's ->iterate_shared(), which allows userspace to prime the counters for the heuristic. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bccb04af38ae..ea1ceffa1d3a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1074,6 +1074,8 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) return res; } +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) + /* * Once we've found the start of the dirent within a page: fill 'er up...
*/ @@ -1083,6 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, struct file *file = desc->file; struct nfs_cache_array *array; unsigned int i; + bool first_emit = !desc->dir_cookie; array = kmap_local_page(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -1106,6 +1109,10 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) { + desc->eob = true; + break; + } } if (array->page_is_eof) desc->eof = !desc->eob; @@ -1187,8 +1194,6 @@ out: return status; } -#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) - static bool nfs_readdir_handle_cache_misses(struct inode *inode, struct nfs_readdir_descriptor *desc, unsigned int cache_misses, -- cgit From e83458fce080dc23c25353a1af90bfecf79c7369 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2022 16:50:12 -0400 Subject: NFSv4: Fix a credential leak in _nfs4_discover_trunking() Fixes: 4f40a5b55446 ("NFSv4: Add an fattr allocation to _nfs4_discover_trunking()") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 98a867092039..bd89c7f06952 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4018,7 +4018,7 @@ static int _nfs4_discover_trunking(struct nfs_server *server, page = alloc_page(GFP_KERNEL); if (!page) - return -ENOMEM; + goto out_put_cred; locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (!locations) goto out_free; @@ -4040,6 +4040,8 @@ out_free_2: kfree(locations); out_free: __free_page(page); +out_put_cred: + put_cred(cred); return status; } -- cgit From 51069e4aef6257b0454057359faed0ab0c9af083 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Nov 2022 13:20:01 -0400 Subject: NFSv4: Fix a deadlock between nfs4_open_recover_helper() and delegreturn If we're asked to recover open state while a delegation return is outstanding, then the state manager thread cannot use a cached open, so if the server returns a delegation, we can end up deadlocked behind the pending delegreturn. To avoid this problem, let's just ask the server not to give us a delegation unless we're explicitly reclaiming one. 
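The "ask for no delegation" part works because the share access WANT bits are derived from the open flags: nfs4_map_atomic_open_share() translates O_DIRECT into NFS4_SHARE_WANT_NO_DELEG on servers that support it, roughly (an excerpt-style sketch, not part of this diff):

	/* Want no delegation if we're using O_DIRECT */
	if (openflags & O_DIRECT)
		res |= NFS4_SHARE_WANT_NO_DELEG;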
Fixes: be36e185bd26 ("NFSv4: nfs4_open_recover_helper() must set share access") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bd89c7f06952..e51044a5f550 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2131,18 +2131,18 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context } static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, - fmode_t fmode) + fmode_t fmode) { struct nfs4_state *newstate; + struct nfs_server *server = NFS_SB(opendata->dentry->d_sb); + int openflags = opendata->o_arg.open_flags; int ret; if (!nfs4_mode_match_open_stateid(opendata->state, fmode)) return 0; - opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; - opendata->o_arg.share_access = nfs4_map_atomic_open_share( - NFS_SB(opendata->dentry->d_sb), - fmode, 0); + opendata->o_arg.share_access = + nfs4_map_atomic_open_share(server, fmode, openflags); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -2724,10 +2724,15 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state, - NFS4_OPEN_CLAIM_FH); + opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); + /* + * We're not recovering a delegation, so ask for no delegation. + * Otherwise the recovery thread could deadlock with an outstanding + * delegation return. + */ + opendata->o_arg.open_flags = O_DIRECT; ret = nfs4_open_recover(opendata, state); if (ret == -ESTALE) d_drop(ctx->dentry); -- cgit From 35e3b6ae84935d0d7ff76cbdaa83411b0ad5e471 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Nov 2022 17:30:39 -0500 Subject: NFS: Fix an Oops in nfs_d_automount() When mounting from a NFSv4 referral, path->dentry can end up being a negative dentry, so derive the struct nfs_server from the dentry itself instead. Fixes: 2b0143b5c986 ("VFS: normal filesystems (and lustre): d_inode() annotations") Signed-off-by: Trond Myklebust --- fs/nfs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2f336ace7555..88a23af2bd5c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -147,7 +147,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fs_context *ctx; struct fs_context *fc; struct vfsmount *mnt = ERR_PTR(-ENOMEM); - struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); + struct nfs_server *server = NFS_SB(path->dentry->d_sb); struct nfs_client *client = server->nfs_client; int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); int ret; -- cgit From 5776a9cd2a66cdc037c8e6ee34cde40ee768893d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Nov 2022 11:11:53 -0500 Subject: NFS: Fix a race in nfs_call_unlink() We should check that the filehandles match before transferring the sillyrename data to the newly looked-up dentry in case the name was reused on the server. 
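For reference, nfs_compare_fh() follows memcmp() conventions and returns 0 only when the two filehandles are identical, so the added check lets the transfer go ahead only while the alias still names the same file object on the server. A minimal sketch of the guard (the helper below is hypothetical, standing in for the transfer logic):

	/* 0 means the filehandles are identical */
	if (nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) == 0)
		transfer_sillyrename_data(alias);	/* hypothetical helper */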
Signed-off-by: Trond Myklebust --- fs/nfs/unlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 9697cd5d2561..150a953a8be9 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -139,6 +139,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf */ spin_lock(&alias->d_lock); if (d_really_is_positive(alias) && + !nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; -- cgit From 672e4268b2863d7e4978dfed29552b31c2f9bd4e Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 28 Nov 2022 11:33:05 +0100 Subject: ovl: fix use inode directly in rcu-walk mode ovl_dentry_revalidate_common() can be called in rcu-walk mode. As the documentation says, "in rcu-walk mode, d_parent and d_inode should not be used without care". Check the inode here to protect access under rcu-walk mode. Fixes: bccece1ead36 ("ovl: allow remote upper") Reported-and-tested-by: syzbot+a4055c78774bbf3498bb@syzkaller.appspotmail.com Signed-off-by: Chen Zhongjin Cc: # v5.7 Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a29a8afe9b26..3d14a3f1465d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -139,11 +139,16 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { struct ovl_entry *oe = dentry->d_fsdata; + struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; int ret = 1; - upper = ovl_dentry_upper(dentry); + /* Careful in RCU mode */ + if (!inode) + return -ECHILD; + + upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); -- cgit From 27e714c007e4ad01837bf0fac5c11913a38d7695 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Nov 2022 03:17:17 +0000 Subject: ext2: unbugger ext2_empty_dir() In 27cfa258951a "ext2: fix fs corruption when trying to remove a non-empty directory with IO error" a funny thing has happened: - page = ext2_get_page(inode, i, dir_has_error, &page_addr); + page = ext2_get_page(inode, i, 0, &page_addr); - if (IS_ERR(page)) { - dir_has_error = 1; - continue; - } + if (IS_ERR(page)) + goto not_empty; And at not_empty: we hit ext2_put_page(page, page_addr), which does put_page(page). Which, unless I'm very mistaken, should oops immediately when given ERR_PTR(-E...) as page. OK, shit happens, insufficiently tested patches included. But when the commit in question describes the fault-injection test that exercised that particular failure exit... Ow.
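The oops is the generic ERR_PTR pitfall: an error pointer is a small negative value cast to a pointer, not a real struct page address, so the reference-count access inside put_page() dereferences unmapped memory. Schematically:

	struct page *page = ERR_PTR(-EIO);	/* (void *)-5, not a page */

	if (!IS_ERR(page))			/* the guard this path lacked */
		put_page(page);			/* put_page(ERR_PTR(...)) oopses */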
CC: stable@vger.kernel.org Fixes: 27cfa258951a ("ext2: fix fs corruption when trying to remove a non-empty directory with IO error") Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/ext2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6fa714dbee84..e5cbc27ba459 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -684,7 +684,7 @@ int ext2_empty_dir (struct inode * inode) page = ext2_get_page(inode, i, 0, &page_addr); if (IS_ERR(page)) - goto not_empty; + return 0; kaddr = page_addr; de = (ext2_dirent *)kaddr; -- cgit From 456b59e757b0c558df550764a4fd5ae6877e93f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2022 17:03:11 +0000 Subject: ovl: update ->f_iocb_flags when ovl_change_flags() modifies ->f_flags ovl_change_flags() is an open-coded variant of fs/fcntl.c:setfl() and it got missed by commit 164f4064ca81 ("keep iocb_flags() result cached in struct file"); the same change applies there. Reported-by: Pierre Labastie Fixes: 164f4064ca81 ("keep iocb_flags() result cached in struct file") Cc: # v6.0 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216738 Signed-off-by: Al Viro Signed-off-by: Miklos Szeredi --- fs/overlayfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a1a22f58ba18..dd688a842b0b 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -96,6 +96,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags) spin_lock(&file->f_lock); file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; + file->f_iocb_flags = iocb_flags(file); spin_unlock(&file->f_lock); return 0; -- cgit From b41b98e12a954c306e8eb9527ada0a70db339683 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 28 Nov 2022 20:05:49 +0800 Subject: fs/ext2: Fix code indentation Code indented with spaces instead of tabs is easily misread when viewed with ts=4. It is better to replace 8 spaces with one tab.
Signed-off-by: Rong Tao Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 12 ++++++------ fs/ext2/super.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 5dc0a31f4a08..eca60b747c6b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -667,7 +667,7 @@ ext2_try_to_allocate(struct super_block *sb, int group, { ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group); ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group); - ext2_grpblk_t start, end; + ext2_grpblk_t start, end; unsigned long num = 0; start = 0; @@ -1481,11 +1481,11 @@ unsigned long ext2_count_free_blocks (struct super_block * sb) desc_count, bitmap_count); return bitmap_count; #else - for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { - desc = ext2_get_group_desc (sb, i, NULL); - if (!desc) - continue; - desc_count += le16_to_cpu(desc->bg_free_blocks_count); + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc(sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); } return desc_count; #endif diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 03f2af98b1b4..69c88facfe90 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1648,7 +1648,7 @@ static int __init init_ext2_fs(void) err = init_inodecache(); if (err) return err; - err = register_filesystem(&ext2_fs_type); + err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; -- cgit From eeadcb75794516839078c28b3730132aeb700ce6 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 13 Sep 2022 14:01:51 -0400 Subject: NFSD: Simplify READ_PLUS Chuck had suggested reverting READ_PLUS so it returns a single DATA segment covering the requested read range. This prepares the server for a future "sparse read" function so support can easily be added without needing to rip out the old READ_PLUS code at the same time. Signed-off-by: Anna Schumaker Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 139 +++++++++++++----------------------------------------- 1 file changed, 32 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index bcfeb1a922c0..1e4990fb1692 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4767,79 +4767,37 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof, - loff_t *pos) + struct nfsd4_read *read) { - struct xdr_stream *xdr = resp->xdr; + bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); struct file *file = read->rd_nf->nf_file; - int starting_len = xdr->buf->len; - loff_t hole_pos; - __be32 nfserr; - __be32 *p, tmp; - __be64 tmp64; - - hole_pos = pos ? 
*pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); - if (hole_pos > read->rd_offset) - *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); - *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + struct xdr_stream *xdr = resp->xdr; + unsigned long maxcount; + __be32 nfserr, *p; /* Content type, offset, byte count */ p = xdr_reserve_space(xdr, 4 + 8 + 4); if (!p) - return nfserr_resource; + return nfserr_io; + if (resp->xdr->buf->page_len && splice_ok) { + WARN_ON_ONCE(splice_ok); + return nfserr_serverfault; + } - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); - if (read->rd_vlen < 0) - return nfserr_resource; + maxcount = min_t(unsigned long, read->rd_length, + (xdr->buf->buflen - xdr->buf->len)); - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (file->f_op->splice_read && splice_ok) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); - - tmp = htonl(NFS4_CONTENT_DATA); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp64 = cpu_to_be64(read->rd_offset); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); - tmp = htonl(*maxcount); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); - - tmp = xdr_zero; - write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, - xdr_pad_size(*maxcount)); - return nfs_ok; -} - -static __be32 -nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, - struct nfsd4_read *read, - unsigned long *maxcount, u32 *eof) -{ - struct file *file = read->rd_nf->nf_file; - loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); - loff_t f_size = i_size_read(file_inode(file)); - unsigned long count; - __be32 *p; - - if (data_pos == -ENXIO) - data_pos = f_size; - else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) - return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); - count = data_pos - read->rd_offset; - /* Content type, offset, byte count */ - p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); - if (!p) - return nfserr_resource; - - *p++ = htonl(NFS4_CONTENT_HOLE); + *p++ = cpu_to_be32(NFS4_CONTENT_DATA); p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + *p = cpu_to_be32(read->rd_length); - *eof = (read->rd_offset + count) >= f_size; - *maxcount = min_t(unsigned long, count, *maxcount); return nfs_ok; } @@ -4847,69 +4805,36 @@ static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { - unsigned long maxcount, count; + struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; - struct file *file; int starting_len = xdr->buf->len; - int last_segment = xdr->buf->len; - int segments = 0; - __be32 *p, tmp; - bool is_data; - loff_t pos; - u32 eof; + u32 segments = 0; + __be32 *p; if (nfserr) return nfserr; - file = read->rd_nf->nf_file; /* eof flag, segment count */ p = xdr_reserve_space(xdr, 4 + 4); if (!p) - return nfserr_resource; + return nfserr_io; xdr_commit_encode(xdr); - maxcount = min_t(unsigned long, read->rd_length, - (xdr->buf->buflen - xdr->buf->len)); - count = maxcount; - - eof = read->rd_offset >= i_size_read(file_inode(file)); - if (eof) + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); + if 
(read->rd_eof) goto out; - pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); - is_data = pos > read->rd_offset; - - while (count > 0 && !eof) { - maxcount = count; - if (is_data) - nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, - segments == 0 ? &pos : NULL); - else - nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); - if (nfserr) - goto out; - count -= maxcount; - read->rd_offset += maxcount; - is_data = !is_data; - last_segment = xdr->buf->len; - segments++; - } - -out: - if (nfserr && segments == 0) + nfserr = nfsd4_encode_read_plus_data(resp, read); + if (nfserr) { xdr_truncate_encode(xdr, starting_len); - else { - if (nfserr) { - xdr_truncate_encode(xdr, last_segment); - nfserr = nfs_ok; - eof = 0; - } - tmp = htonl(eof); - write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); - tmp = htonl(segments); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + return nfserr; } + segments++; + +out: + p = xdr_encode_bool(p, read->rd_eof); + *p = cpu_to_be32(segments); return nfserr; } -- cgit From 69eed23baf877bbb1f14d7f4df54f89807c9ee2a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Oct 2022 21:24:23 +0100 Subject: NFSD: Remove redundant assignment to variable host_err Variable host_err is assigned a value that is never read; it is re-assigned in every execution path of the following switch statement. The assignment is redundant and can be removed. Cleans up clang-scan warning: warning: Value stored to 'host_err' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 849a720ab43f..d6d90d63485d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1305,7 +1305,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode &= ~current_umask(); err = 0; - host_err = 0; switch (type) { case S_IFREG: host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); -- cgit From ea5021e911d3479346a75ac9b7d9dcd751b0fb99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 16 Oct 2022 11:47:02 -0400 Subject: NFSD: Finish converting the NFSv2 GETACL result encoder The xdr_stream conversion inadvertently left some code that set the page_len of the send buffer. The XDR stream encoders should handle this automatically now. This oversight adds garbage past the end of the Reply message. Clients typically ignore the garbage, but NFSD does not need to send it, as it leaks stale memory contents onto the wire. Fixes: f8cba47344f7 ("NFSD: Update the NFSv2 GETACL result encoder to use struct xdr_stream") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 13e6e6897f6c..65d4511b7af0 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -246,7 +246,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct inode *inode; - int w; if (!svcxdr_encode_stat(xdr, resp->status)) return false; @@ -260,15 +259,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return false; - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ?
resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return true; - w -= PAGE_SIZE; - } - if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, resp->mask & NFS_ACL, 0)) return false; -- cgit From 841fd0a3cb490eae5dfd262eccb8c8b11d57f8b8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 16 Oct 2022 11:47:08 -0400 Subject: NFSD: Finish converting the NFSv3 GETACL result encoder For some reason, the NFSv2 GETACL result encoder was fully converted to use the new nfs_stream_encode_acl(), but the NFSv3 equivalent was not similarly converted. Fixes: 20798dfe249a ("NFSD: Update the NFSv3 GETACL result encoder to use struct xdr_stream") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs3acl.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2fb9ee356455..a34a22e272ad 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -171,11 +171,7 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; - struct kvec *head = rqstp->rq_res.head; struct inode *inode; - unsigned int base; - int n; - int w; if (!svcxdr_encode_nfsstat3(xdr, resp->status)) return false; @@ -187,26 +183,12 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return false; - base = (char *)xdr->p - (char *)head->iov_base; - - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return false; - w -= PAGE_SIZE; - } - - n = nfsacl_encode(&rqstp->rq_res, base, inode, - resp->acl_access, - resp->mask & NFS_ACL, 0); - if (n > 0) - n = nfsacl_encode(&rqstp->rq_res, base + n, inode, - resp->acl_default, - resp->mask & NFS_DFACL, - NFS_ACL_DEFAULT); - if (n <= 0) + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, + resp->mask & NFS_ACL, 0)) + return false; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT)) return false; break; default: -- cgit From 8e823bafff2308753d430566256c83d8085952da Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:54 -0400 Subject: nfsd: ignore requests to disable unsupported versions The kernel currently errors out if you attempt to enable or disable a version that it doesn't recognize. Change it to ignore attempts to disable an unrecognized version. If we don't support it, then there is no harm in doing so. 
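With the change, the default arm of the version switch distinguishes the two directions; schematically (cmd is NFSD_SET for a "+N" token and NFSD_CLEAR for a "-N" token, per the cmd assignment visible in __write_versions()):

	default:
		/* enabling an unknown version still fails... */
		if (cmd == NFSD_SET)
			return -EINVAL;
		/* ...while disabling one now falls through and is ignored */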
Signed-off-by: Jeff Layton Reviewed-by: Tom Talpey Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index dc74a947a440..68ed42fd29fc 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -601,7 +601,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } break; default: - return -EINVAL; + /* Ignore requests to disable non-existent versions */ + if (cmd == NFSD_SET) + return -EINVAL; } vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); -- cgit From cb12fae1c34b1fa7eaae92c5aadc72d86d7fae19 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:55 -0400 Subject: nfsd: move nfserrno() to vfs.c nfserrno() is common to all nfs versions, but nfsproc.c is specifically for NFSv2. Move it to vfs.c, and the prototype to vfs.h. While we're in here, remove the #ifdef EDQUOT check in this function. It's apparently a holdover from the initial merge of the nfsd code in 1997. No other place in the kernel checks that that symbol is defined before using it, so I think we can dispense with it here. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 1 + fs/nfsd/blocklayoutxdr.c | 1 + fs/nfsd/export.h | 1 - fs/nfsd/flexfilelayout.c | 1 + fs/nfsd/nfs4idmap.c | 1 + fs/nfsd/nfsproc.c | 62 ----------------------------------------------- fs/nfsd/vfs.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/vfs.h | 1 + 8 files changed, 68 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index b6d01d51a746..04697f8dc37d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -12,6 +12,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" #include "filecache.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 442543304930..8e9c1a0f8d38 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -9,6 +9,7 @@ #include "nfsd.h" #include "blocklayoutxdr.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index ee0e3aba4a6e..d03f7f6a8642 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -115,7 +115,6 @@ struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); int exp_rootfh(struct net *, struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -__be32 nfserrno(int errno); static inline void exp_put(struct svc_export *exp) { diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index 070f90ed09b6..3ca5304440ff 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -15,6 +15,7 @@ #include "flexfilelayoutxdr.h" #include "pnfs.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index e70a1a2999b7..5e9809aff37e 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -41,6 +41,7 @@ #include "idmap.h" #include "nfsd.h" #include "netns.h" +#include "vfs.h" /* * Turn off idmapping when using AUTH_SYS. diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 82b3ddeacc33..52fc222c34f2 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -848,65 +848,3 @@ const struct svc_version nfsd_version2 = { .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS2_SVC_XDRSIZE, }; - -/* - * Map errnos to NFS errnos. 
- */ -__be32 -nfserrno (int errno) -{ - static struct { - __be32 nfserr; - int syserr; - } nfs_errtbl[] = { - { nfs_ok, 0 }, - { nfserr_perm, -EPERM }, - { nfserr_noent, -ENOENT }, - { nfserr_io, -EIO }, - { nfserr_nxio, -ENXIO }, - { nfserr_fbig, -E2BIG }, - { nfserr_stale, -EBADF }, - { nfserr_acces, -EACCES }, - { nfserr_exist, -EEXIST }, - { nfserr_xdev, -EXDEV }, - { nfserr_mlink, -EMLINK }, - { nfserr_nodev, -ENODEV }, - { nfserr_notdir, -ENOTDIR }, - { nfserr_isdir, -EISDIR }, - { nfserr_inval, -EINVAL }, - { nfserr_fbig, -EFBIG }, - { nfserr_nospc, -ENOSPC }, - { nfserr_rofs, -EROFS }, - { nfserr_mlink, -EMLINK }, - { nfserr_nametoolong, -ENAMETOOLONG }, - { nfserr_notempty, -ENOTEMPTY }, -#ifdef EDQUOT - { nfserr_dquot, -EDQUOT }, -#endif - { nfserr_stale, -ESTALE }, - { nfserr_jukebox, -ETIMEDOUT }, - { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_jukebox, -EAGAIN }, - { nfserr_jukebox, -EWOULDBLOCK }, - { nfserr_jukebox, -ENOMEM }, - { nfserr_io, -ETXTBSY }, - { nfserr_notsupp, -EOPNOTSUPP }, - { nfserr_toosmall, -ETOOSMALL }, - { nfserr_serverfault, -ESERVERFAULT }, - { nfserr_serverfault, -ENFILE }, - { nfserr_io, -EREMOTEIO }, - { nfserr_stale, -EOPENSTALE }, - { nfserr_io, -EUCLEAN }, - { nfserr_perm, -ENOKEY }, - { nfserr_no_grace, -ENOGRACE}, - }; - int i; - - for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { - if (nfs_errtbl[i].syserr == errno) - return nfs_errtbl[i].nfserr; - } - WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); - return nfserr_io; -} - diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d6d90d63485d..278ebc677061 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,6 +49,69 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +/** + * nfserrno - Map Linux errnos to NFS errnos + * @errno: POSIX(-ish) error code to be mapped + * + * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If + * it's an error we don't expect, log it once and return nfserr_io. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, + { nfserr_dquot, -EDQUOT }, + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, + { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. 
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 120521bc7b24..8ddd687f8359 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -60,6 +60,7 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs) posix_acl_release(attrs->na_dpacl); } +__be32 nfserrno (int errno); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, -- cgit From 2f3a4b2ac2f28b9be78ad21f401f31e263845214 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Oct 2022 07:47:56 -0400 Subject: nfsd: allow disabling NFSv2 at compile time rpc.nfsd stopped supporting NFSv2 a year ago. Take the next logical step toward deprecating it and allow NFSv2 support to be compiled out. Add a new CONFIG_NFSD_V2 option that can be turned off and rework the CONFIG_NFSD_V?_ACL option dependencies. Add a description that discourages enabling it. Also, change the description of CONFIG_NFSD to state that the always-on version is now 3 instead of 2. Finally, add an #ifdef around "case 2:" in __write_versions. When NFSv2 is disabled at compile time, this should make the kernel ignore attempts to disable it at runtime, but still error out when trying to enable it. Signed-off-by: Jeff Layton Reviewed-by: Tom Talpey Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 19 +++++++++++++++---- fs/nfsd/Makefile | 5 +++-- fs/nfsd/nfsctl.c | 2 ++ fs/nfsd/nfsd.h | 3 +-- fs/nfsd/nfssvc.c | 6 ++++++ 5 files changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f6a2fd3015e7..7c441f2bd444 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -8,6 +8,7 @@ config NFSD select SUNRPC select EXPORTFS select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFS_ACL_SUPPORT if NFSD_V3_ACL depends on MULTIUSER help Choose Y here if you want to allow other computers to access @@ -26,19 +27,29 @@ config NFSD Below you can choose which versions of the NFS protocol are available to clients mounting the NFS server on this system. - Support for NFS version 2 (RFC 1094) is always available when + Support for NFS version 3 (RFC 1813) is always available when CONFIG_NFSD is selected. If unsure, say N. -config NFSD_V2_ACL - bool +config NFSD_V2 + bool "NFS server support for NFS version 2 (DEPRECATED)" depends on NFSD + default n + help + NFSv2 (RFC 1094) was the first publicly-released version of NFS. + Unless you are hosting ancient (1990's era) NFS clients, you don't + need this. + + If unsure, say N. + +config NFSD_V2_ACL + bool "NFS server support for the NFSv2 ACL protocol extension" + depends on NFSD_V2 config NFSD_V3_ACL bool "NFS server support for the NFSv3 ACL protocol extension" depends on NFSD - select NFSD_V2_ACL help Solaris NFS servers support an auxiliary NFSv3 ACL protocol that never became an official part of the NFS version 3 protocol. 
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 805c06d5f1b4..6fffc8f03f74 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -10,9 +10,10 @@ obj-$(CONFIG_NFSD) += nfsd.o # this one should be compiled first, as the tracing macros can easily blow up nfsd-y += trace.o -nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o \ +nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o \ stats.o filecache.o nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 68ed42fd29fc..d1e581a60480 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -581,7 +581,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { +#ifdef CONFIG_NFSD_V2 case 2: +#endif case 3: nfsd_vers(nn, num, cmd); break; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 09726c5b9a31..93b42ef9ed91 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -64,8 +64,7 @@ struct readdir_cd { extern struct svc_program nfsd_program; -extern const struct svc_version nfsd_version2, nfsd_version3, - nfsd_version4; +extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index bfbd9f672f59..62e473b0ca52 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -91,8 +91,12 @@ unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { +# if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, +# endif +# if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, +# endif }; #define NFSD_ACL_MINVERS 2 @@ -116,7 +120,9 @@ static struct svc_stat nfsd_acl_svcstats = { #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { +#if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, +#endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, -- cgit From 427505ffeaa464f683faba945a88d3e3248f6979 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 21 Oct 2022 14:24:14 +0200 Subject: exportfs: use pr_debug for unreachable debug statements expfs.c has a bunch of dprintk statements which are unusable due to: #define dprintk(fmt, args...) do{}while(0) Use pr_debug so that they can be enabled dynamically. Also make some minor changes to the debug statements to fix some incorrect types, and remove __func__ which can be handled by dynamic debug separately. Signed-off-by: David Disseldorp Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/exportfs/expfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c648a493faf2..3204bd33e4e8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -18,7 +18,7 @@ #include #include -#define dprintk(fmt, args...) do{}while(0) +#define dprintk(fmt, args...) 
pr_debug(fmt, ##args) static int get_name(const struct path *path, char *name, struct dentry *child); @@ -132,8 +132,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { - dprintk("%s: get_parent of %ld failed, err %d\n", - __func__, dentry->d_inode->i_ino, PTR_ERR(parent)); + dprintk("get_parent of %lu failed, err %ld\n", + dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } @@ -147,7 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, dprintk("%s: found name: %s\n", __func__, nbuf); tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { - dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); + dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); goto out_err; } -- cgit From e0aa651068bfd520afcd357af8ecd2de005fc83d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Oct 2022 08:13:53 -0400 Subject: nfsd: don't call nfsd_file_put from client states seqfile display We had a report of this: BUG: sleeping function called from invalid context at fs/nfsd/filecache.c:440 ...with a stack trace showing nfsd_file_put being called from nfs4_show_open. This code has always tried to call fput while holding a spinlock, but we recently changed this to use the filecache, and that started triggering the might_sleep() in nfsd_file_put. states_start takes and holds the cl_lock while iterating over the client's states, and we can't sleep with that held. Have the various nfs4_show_* functions hold the fi_lock instead of taking an nfsd_file reference. Fixes: 78599c42ae3c ("nfsd4: add file to display list of client's opens") Link: https://bugzilla.redhat.com/show_bug.cgi?id=2138357 Reported-by: Zhi Li Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 836bd825ca4a..52b5552d0d70 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -675,15 +675,26 @@ find_any_file(struct nfs4_file *f) return ret; } -static struct nfsd_file *find_deleg_file(struct nfs4_file *f) +static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) { - struct nfsd_file *ret = NULL; + lockdep_assert_held(&f->fi_lock); + + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return NULL; +} + +static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f) +{ + lockdep_assert_held(&f->fi_lock); - spin_lock(&f->fi_lock); if (f->fi_deleg_file) - ret = nfsd_file_get(f->fi_deleg_file); - spin_unlock(&f->fi_lock); - return ret; + return f->fi_deleg_file; + return NULL; } static atomic_long_t num_delegations; @@ -2613,9 +2624,11 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - file = find_any_file(nf); + + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2637,8 +2650,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } @@ -2652,9 +2665,10 @@ static int nfs4_show_lock(struct seq_file *s, struct
nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - file = find_any_file(nf); + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2674,8 +2688,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } @@ -2687,9 +2701,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - file = find_deleg_file(nf); + spin_lock(&nf->fi_lock); + file = find_deleg_file_locked(nf); if (!file) - return 0; + goto out; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2705,8 +2720,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_fname(s, file); seq_printf(s, " }\n"); - nfsd_file_put(file); - +out: + spin_unlock(&nf->fi_lock); return 0; } -- cgit From c252849082ff525af18b4f253b3c9ece94e951ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:38 -0400 Subject: NFSD: Pass the target nfsd_file to nfsd_commit() In a moment I'm going to introduce separate nfsd_file types, one of which is garbage-collected; the other, not. The garbage-collected variety is to be used by NFSv2 and v3, and the non-garbage-collected variety is to be used by NFSv4. nfsd_commit() is invoked by both NFSv3 and NFSv4 consumers. We want nfsd_commit() to find and use the correct variety of cached nfsd_file object for the NFS version that is in use. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/nfs3proc.c | 10 +++++++++- fs/nfsd/nfs4proc.c | 11 ++++++++++- fs/nfsd/vfs.c | 15 ++++----------- fs/nfsd/vfs.h | 3 ++- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 923d9a80df92..ff2920546333 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -13,6 +13,7 @@ #include "cache.h" #include "xdr3.h" #include "vfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -763,6 +764,7 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) { struct nfsd3_commitargs *argp = rqstp->rq_argp; struct nfsd3_commitres *resp = rqstp->rq_resp; + struct nfsd_file *nf; dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", SVCFH_fmt(&argp->fh), @@ -770,8 +772,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) (unsigned long long) argp->offset); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset, + resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (resp->status) + goto out; + resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset, argp->count, resp->verf); + nfsd_file_put(nf); +out: return rpc_success; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8beb2bc4c328..42db62413890 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -731,10 +731,19 @@ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_commit *commit = &u->commit; + struct nfsd_file *nf; + __be32 status; - return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (status != nfs_ok) + return status; + + status = nfsd_commit(rqstp, 
&cstate->current_fh, nf, commit->co_offset, commit->co_count, (__be32 *)commit->co_verf.data); + nfsd_file_put(nf); + return status; } static __be32 diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 278ebc677061..6f39d6c38116 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1196,6 +1196,7 @@ out: * nfsd_commit - Commit pending writes to stable storage * @rqstp: RPC request being processed * @fhp: NFS filehandle + * @nf: target file * @offset: raw offset from beginning of file * @count: raw count of bytes to sync * @verf: filled in with the server's current write verifier @@ -1212,19 +1213,13 @@ out: * An nfsstat value in network byte order. */ __be32 -nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, - u32 count, __be32 *verf) +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, + u64 offset, u32 count, __be32 *verf) { + __be32 err = nfs_ok; u64 maxbytes; loff_t start, end; struct nfsd_net *nn; - struct nfsd_file *nf; - __be32 err; - - err = nfsd_file_acquire(rqstp, fhp, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); - if (err) - goto out; /* * Convert the client-provided (offset, count) range to a @@ -1265,8 +1260,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, } else nfsd_copy_write_verifier(verf, nn); - nfsd_file_put(nf); -out: return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 8ddd687f8359..dbdfef7ae85b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -89,7 +89,8 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct svc_fh *resfhp, struct nfsd_attrs *iap); __be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, - u64 offset, u32 count, __be32 *verf); + struct nfsd_file *nf, u64 offset, u32 count, + __be32 *verf); #ifdef CONFIG_NFSD_V4 __be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, void **bufp, int *lenp); -- cgit From dcf3f80965ca787c70def402cdf1553c93c75529 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:44 -0400 Subject: NFSD: Revert "NFSD: NFSv4 CLOSE should release an nfsd_file immediately" This reverts commit 5e138c4a750dc140d881dab4a8804b094bbc08d2. That commit attempted to make files available to other users as soon as all NFSv4 clients were done with them, rather than waiting until the filecache LRU had garbage collected them. It gets the reference counting wrong, for one thing. But it also misses that DELEGRETURN should release a file in the same fashion. In fact, any nfsd_file_put() on a file held open by an NFSv4 client potentially needs to release the file immediately... Clear the way for implementing that idea. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/filecache.c | 18 ------------------ fs/nfsd/filecache.h | 1 - fs/nfsd/nfs4state.c | 4 ++-- 3 files changed, 2 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ec3fceb92236..babea79d3f6f 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -444,24 +444,6 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_put_noref(nf); } -/** - * nfsd_file_close - Close an nfsd_file - * @nf: nfsd_file to close - * - * If this is the final reference for @nf, free it immediately. - * This reflects an on-the-wire CLOSE or DELEGRETURN into the - * VFS and exported filesystem.
- */ -void nfsd_file_close(struct nfsd_file *nf) -{ - nfsd_file_put(nf); - if (refcount_dec_if_one(&nf->nf_ref)) { - nfsd_file_unhash(nf); - nfsd_file_lru_remove(nf); - nfsd_file_free(nf); - } -} - struct nfsd_file * nfsd_file_get(struct nfsd_file *nf) { diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 357832bac736..6b012ea4bd9d 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -52,7 +52,6 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); -void nfsd_file_close(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 52b5552d0d70..16c3e991ddcc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -842,9 +842,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - nfsd_file_close(f1); + nfsd_file_put(f1); if (f2) - nfsd_file_close(f2); + nfsd_file_put(f2); } } -- cgit From 4d1ea8455716ca070e3cd85767e6f6a562a58b1b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:51 -0400 Subject: NFSD: Add an NFSD_FILE_GC flag to enable nfsd_file garbage collection NFSv4 operations manage the lifetime of nfsd_file items they use by means of NFSv4 OPEN and CLOSE. Hence there's no need for them to be garbage collected. Introduce a mechanism to enable garbage collection for nfsd_file items used only by NFSv2/3 callers. Note that the change in nfsd_file_put() ensures that both CLOSE and DELEGRETURN will actually close out and free an nfsd_file on last reference of a non-garbage-collected file. 
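To make the two calling conventions concrete, here is a minimal sketch (the acquire/put helpers are the ones introduced in this patch; the surrounding caller, flags, and error handling are illustrative only):

	struct nfsd_file *nf;
	__be32 status;

	/* NFSv2/v3 style: the entry lingers on the LRU after the
	 * final put, so a quick re-access can hit the cache. */
	status = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
	if (status != nfs_ok)
		return status;
	/* ... I/O against nf->nf_file ... */
	nfsd_file_put(nf);	/* nf may stay cached for re-use */

	/* NFSv4 style: lifetime is driven by OPEN/CLOSE or delegation
	 * state, so the final put releases the open file right away. */
	status = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
	if (status != nfs_ok)
		return status;
	/* ... */
	nfsd_file_put(nf);	/* final reference closes the file */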
Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=394 Suggested-by: Trond Myklebust Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/nfsd/filecache.h | 3 +++ fs/nfsd/nfs3proc.c | 4 ++-- fs/nfsd/trace.h | 3 ++- fs/nfsd/vfs.c | 4 ++-- 5 files changed, 64 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index babea79d3f6f..cee44405cf7d 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -63,6 +63,7 @@ struct nfsd_file_lookup_key { struct net *net; const struct cred *cred; unsigned char need; + bool gc; enum nfsd_file_lookup_type type; }; @@ -162,6 +163,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, return 1; if (!nfsd_match_cred(nf->nf_cred, key->cred)) return 1; + if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) + return 1; if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) return 1; break; @@ -297,6 +300,8 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) nf->nf_flags = 0; __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + if (key->gc) + __set_bit(NFSD_FILE_GC, &nf->nf_flags); nf->nf_inode = key->inode; /* nf_ref is pre-incremented for hash table */ refcount_set(&nf->nf_ref, 2); @@ -428,16 +433,27 @@ nfsd_file_put_noref(struct nfsd_file *nf) } } +static void +nfsd_file_unhash_and_put(struct nfsd_file *nf) +{ + if (nfsd_file_unhash(nf)) + nfsd_file_put_noref(nf); +} + void nfsd_file_put(struct nfsd_file *nf) { might_sleep(); - nfsd_file_lru_add(nf); - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) + nfsd_file_lru_add(nf); + else if (refcount_read(&nf->nf_ref) == 2) + nfsd_file_unhash_and_put(nf); + + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { nfsd_file_flush(nf); nfsd_file_put_noref(nf); - } else if (nf->nf_file) { + } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) { nfsd_file_put_noref(nf); nfsd_file_schedule_laundrette(); } else @@ -1016,12 +1032,14 @@ nfsd_file_is_cached(struct inode *inode) static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf, bool open) + unsigned int may_flags, struct nfsd_file **pnf, + bool open, bool want_gc) { struct nfsd_file_lookup_key key = { .type = NFSD_FILE_KEY_FULL, .need = may_flags & NFSD_FILE_MAY_MASK, .net = SVC_NET(rqstp), + .gc = want_gc, }; bool open_retry = true; struct nfsd_file *nf; @@ -1117,14 +1135,35 @@ open_file: * then unhash. */ if (status != nfs_ok || key.inode->i_nlink == 0) - if (nfsd_file_unhash(nf)) - nfsd_file_put_noref(nf); + nfsd_file_unhash_and_put(nf); clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); smp_mb__after_atomic(); wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); goto out; } +/** + * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file object returned by this API is reference-counted + * and garbage-collected. The object is retained for a few + * seconds after the final nfsd_file_put() in case the caller + * wants to re-use it. + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. 
+ */ +__be32 +nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, true); +} + /** * nfsd_file_acquire - Get a struct nfsd_file with an open file * @rqstp: the RPC transaction being executed @@ -1132,6 +1171,10 @@ open_file: * @may_flags: NFSD_MAY_ settings for the file * @pnf: OUT: new or found "struct nfsd_file" object * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). + * * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in * network byte order is returned. */ @@ -1139,7 +1182,7 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, false); } /** @@ -1149,6 +1192,10 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, * @may_flags: NFSD_MAY_ settings for the file * @pnf: OUT: new or found "struct nfsd_file" object * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is released immediately + * one RCU grace period after the final nfsd_file_put(). + * * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in * network byte order is returned. */ @@ -1156,7 +1203,7 @@ __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, false); } /* diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 6b012ea4bd9d..b7efb2c3ddb1 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -38,6 +38,7 @@ struct nfsd_file { #define NFSD_FILE_HASHED (0) #define NFSD_FILE_PENDING (1) #define NFSD_FILE_REFERENCED (2) +#define NFSD_FILE_GC (3) unsigned long nf_flags; struct inode *nf_inode; /* don't deref */ refcount_t nf_ref; @@ -55,6 +56,8 @@ void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index ff2920546333..d01b29aba662 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -772,8 +772,8 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) (unsigned long long) argp->offset); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_file_acquire(rqstp, &resp->fh, NFSD_MAY_WRITE | - NFSD_MAY_NOT_BREAK_LEASE, &nf); + resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); if (resp->status) goto out; resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset, diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d4b6839bb459..3fcfeb7b560f 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -817,7 +817,8 @@ DEFINE_CLID_EVENT(confirmed_r); __print_flags(val, "|", \ { 1 << NFSD_FILE_HASHED, "HASHED" }, \ { 1 << NFSD_FILE_PENDING, "PENDING" }, \ - { 1 
<< NFSD_FILE_REFERENCED, "REFERENCED"}) + { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}, \ + { 1 << NFSD_FILE_GC, "GC"}) DECLARE_EVENT_CLASS(nfsd_file_class, TP_PROTO(struct nfsd_file *nf), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6f39d6c38116..961e411080f9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1148,7 +1148,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; @@ -1180,7 +1180,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; -- cgit From b3276c1f5b268ff56622e9e125b792b4c3dc03ac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Nov 2022 13:30:46 -0400 Subject: NFSD: Flesh out a documenting comment for filecache.c Record what we've learned recently about the NFSD filecache in a documenting comment so our future selves don't forget what all this is for. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index cee44405cf7d..897dd47ffb0a 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -2,6 +2,30 @@ * Open file cache. * * (c) 2015 - Jeff Layton + * + * An nfsd_file object is a per-file collection of open state that binds + * together: + * - a struct file * + * - a user credential + * - a network namespace + * - a read-ahead context + * - monitoring for writeback errors + * + * nfsd_file objects are reference-counted. Consumers acquire a new + * object via the nfsd_file_acquire API. They manage their interest in + * the acquired object, and hence the object's reference count, via + * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file + * object: + * + * * non-garbage-collected: When a consumer wants to precisely control + * the lifetime of a file's open state, it acquires a non-garbage- + * collected nfsd_file. The final nfsd_file_put releases the open + * state immediately. + * + * * garbage-collected: When a consumer does not control the lifetime + * of open state, it acquires a garbage-collected nfsd_file. The + * final nfsd_file_put allows the open state to linger for a period + * during which it may be re-used. */ #include -- cgit From eeff73f7c1c583f79a401284f46c619294859310 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:46:57 -0400 Subject: NFSD: Clean up nfs4_preprocess_stateid_op() call sites Remove the lame-duck dprintk()s around nfs4_preprocess_stateid_op() call sites. 
Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/nfs4proc.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 42db62413890..6f7e0c5e62d2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -943,12 +943,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, &read->rd_nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); - goto out; - } - status = nfs_ok; -out: + read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -1117,10 +1112,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, WR_STATE, NULL, NULL); - if (status) { - dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + if (status) return status; - } } err = fh_want_write(&cstate->current_fh); if (err) @@ -1168,10 +1161,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, stateid, WR_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + if (status) return status; - } write->wr_how_written = write->wr_stable_how; @@ -1202,17 +1193,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, src_stateid, RD_STATE, src, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + if (status) goto out; - } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, dst_stateid, WR_STATE, dst, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + if (status) goto out_put_src; - } /* fix up for NFS-specific error code */ if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || @@ -1957,10 +1944,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, WR_STATE, &nf, NULL); - if (status != nfs_ok) { - dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + if (status != nfs_ok) return status; - } status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, @@ -2016,10 +2001,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, RD_STATE, &nf, NULL); - if (status) { - dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + if (status) return status; - } switch (seek->seek_whence) { case NFS4_CONTENT_DATA: -- cgit From 20eee313ff4b8a7e71ae9560f5c4ba27cd763005 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:03 -0400 Subject: NFSD: Trace stateids returned via DELEGRETURN Handing out a delegation stateid is recorded with the nfsd_deleg_read tracepoint, but there isn't a matching tracepoint for recording when the stateid is returned. 
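Since this reuses the existing stateid event class, the new record should look much like the other stateid events; an example with invented values for illustration: nfsd-1104 [005] 1912.002591: nfsd_deleg_return: client 633c9343:4e82788d stateid 00000003:00000001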
Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 1 + fs/nfsd/trace.h | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 16c3e991ddcc..9ec66611a458 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6917,6 +6917,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto put_stateid; + trace_nfsd_deleg_return(stateid); wake_up_var(d_inode(cstate->current_fh.fh_dentry)); destroy_delegation(dp); put_stateid: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 3fcfeb7b560f..00908f75e0a6 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -604,6 +604,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); DECLARE_EVENT_CLASS(nfsd_stateseqid_class, -- cgit From a1c74569bbde91299f24535abf711be5c84df9de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:09 -0400 Subject: NFSD: Trace delegation revocations Delegation revocation is an exceptional event that is not otherwise visible externally (eg, no network traffic is emitted). Generate a trace record when it occurs so that revocation can be observed or other activity can be triggered. Example: nfsd-1104 [005] 1912.002544: nfsd_stid_revoke: client 633c9343:4e82788d stateid 00000003:00000001 ref=2 type=DELEG Trace infrastructure is provided for subsequent additional tracing related to nfs4_stid activity. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 2 ++ fs/nfsd/trace.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9ec66611a458..7c131cac2705 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1366,6 +1366,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + trace_nfsd_stid_revoke(&dp->dl_stid); + if (clp->cl_minorversion) { dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 00908f75e0a6..b648db4d0346 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -637,6 +637,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); +TRACE_DEFINE_ENUM(NFS4_OPEN_STID); +TRACE_DEFINE_ENUM(NFS4_LOCK_STID); +TRACE_DEFINE_ENUM(NFS4_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); +TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); +TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); + +#define show_stid_type(x) \ + __print_flags(x, "|", \ + { NFS4_OPEN_STID, "OPEN" }, \ + { NFS4_LOCK_STID, "LOCK" }, \ + { NFS4_DELEG_STID, "DELEG" }, \ + { NFS4_CLOSED_STID, "CLOSED" }, \ + { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ + { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ + { NFS4_LAYOUT_STID, "LAYOUT" }) + +DECLARE_EVENT_CLASS(nfsd_stid_class, + TP_PROTO( + const struct nfs4_stid *stid + ), + TP_ARGS(stid), + TP_STRUCT__entry( + __field(unsigned long, sc_type) + __field(int, sc_count) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + + __entry->sc_type = stid->sc_type; + __entry->sc_count = refcount_read(&stid->sc_count); + __entry->cl_boot = 
stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation, + __entry->sc_count, show_stid_type(__entry->sc_type) + ) +); + +#define DEFINE_STID_EVENT(name) \ +DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ + TP_PROTO(const struct nfs4_stid *stid), \ + TP_ARGS(stid)) + +DEFINE_STID_EVENT(revoke); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), -- cgit From b48f8056c034f28dd54668399f1d22be421b0bef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:16 -0400 Subject: NFSD: Use const pointers as parameters to fh_ helpers Enable callers to use const pointers where they are able to. Signed-off-by: Chuck Lever Tested-by: Jeff Layton Reviewed-by: Jeff Layton Reviewed-by: NeilBrown --- fs/nfsd/nfsfh.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index c3ae6414fc5c..513e028b0bbe 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -220,7 +220,7 @@ __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); static __inline__ struct svc_fh * -fh_copy(struct svc_fh *dst, struct svc_fh *src) +fh_copy(struct svc_fh *dst, const struct svc_fh *src) { WARN_ON(src->fh_dentry); @@ -229,7 +229,7 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src) } static inline void -fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src) { dst->fh_size = src->fh_size; memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size); @@ -243,7 +243,8 @@ fh_init(struct svc_fh *fhp, int maxsize) return fhp; } -static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_size != fh2->fh_size) return false; @@ -252,7 +253,8 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) return true; } -static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_fsid_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_fsid_type != fh2->fh_fsid_type) return false; -- cgit From 3fe828caddd81e68e9d29353c6e9285a658ca056 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:22 -0400 Subject: NFSD: Update file_hashtbl() helpers Enable callers to use const pointers for type safety. 
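Together with the fh_ helper constification in the previous patch, this lets read-only consumers keep their pointers const end to end. A hypothetical caller (not part of the patch) that now compiles cleanly:

	static bool same_filehandle(const struct svc_fh *a,
				    const struct svc_fh *b)
	{
		/* legal only now that fh_match() accepts const arguments */
		return fh_match(&a->fh_handle, &b->fh_handle);
	}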
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7c131cac2705..baf03a31203c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -721,7 +721,7 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -static unsigned int file_hashval(struct svc_fh *fh) +static unsigned int file_hashval(const struct svc_fh *fh) { struct inode *inode = d_inode(fh->fh_dentry); @@ -4686,7 +4686,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) /* search file_hashtbl[] for file */ static struct nfs4_file * -find_file_locked(struct svc_fh *fh, unsigned int hashval) +find_file_locked(const struct svc_fh *fh, unsigned int hashval) { struct nfs4_file *fp; -- cgit From 81a21fa3e7fdecb3c5b97014f0fc5a17d5806cae Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:28 -0400 Subject: NFSD: Clean up nfsd4_init_file() Name this function more consistently. I'm going to use nfsd4_file_ and nfsd4_file_hash_ for these helpers. Change the @fh parameter to be const pointer for better type safety. Finally, move the hash insertion operation to the caller. This is typical for most other "init_object" type helpers, and it is where most of the other nfs4_file hash table operations are located. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index baf03a31203c..d63a4e5439c2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4277,11 +4277,9 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, - struct nfs4_file *fp) -{ - lockdep_assert_held(&state_lock); +static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) +{ refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); @@ -4299,7 +4297,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); #endif - hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -4717,7 +4714,8 @@ static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, fp->fi_aliased = alias_found = true; } if (likely(ret == NULL)) { - nfsd4_init_file(fh, hashval, new); + nfsd4_file_init(fh, new); + hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); new->fi_aliased = alias_found; ret = new; } -- cgit From 3341678f2fd6106055cead09e513fad6950a0d19 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:34 -0400 Subject: NFSD: Add a nfsd4_file_hash_remove() helper Refactor to relocate hash deletion operation to a helper function that is close to most other nfs4_file data structure operations. The "noinline" annotation will become useful in a moment when the hlist_del_rcu() is replaced with a more complex rhash remove operation. It also guarantees that hash remove operations can be traced with "-p function -l remove_nfs4_file_locked". This also simplifies the organization of forward declarations: the to-be-added rhashtable and its param structure will be defined /after/ put_nfs4_file(). 
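For example, assuming a standard trace-cmd setup (and using the helper name as introduced by this patch), the function tracing mentioned above could presumably be enabled with: trace-cmd record -p function -l nfsd4_file_hash_remove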
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d63a4e5439c2..fb70cab33887 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -84,6 +84,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); +static void nfsd4_file_hash_remove(struct nfs4_file *fi); /* Locking: */ @@ -591,7 +592,7 @@ put_nfs4_file(struct nfs4_file *fi) might_lock(&state_lock); if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { - hlist_del_rcu(&fi->fi_hash); + nfsd4_file_hash_remove(fi); spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); @@ -4749,6 +4750,11 @@ find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) return insert_file(new, fh, hashval); } +static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) +{ + hlist_del_rcu(&fi->fi_hash); +} + /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid -- cgit From 9270fc514ba7d415636b23bcb937573a1ce54f6a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:41 -0400 Subject: NFSD: Clean up find_or_add_file() Remove the call to find_file_locked() in insert_nfs4_file(). Tracing shows that over 99% of these calls return NULL. Thus it is not worth the expense of the extra bucket list traversal. insert_file() already deals correctly with the case where the item is already in the hash bucket. Since nfsd4_file_hash_insert() is now just a wrapper around insert_file(), move the meat of insert_file() into nfsd4_file_hash_insert() and get rid of it. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown --- fs/nfsd/nfs4state.c | 64 +++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb70cab33887..d63ce2118042 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4698,24 +4698,42 @@ find_file_locked(const struct svc_fh *fh, unsigned int hashval) return NULL; } -static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, - unsigned int hashval) +static struct nfs4_file * find_file(struct svc_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + return fp; +} + +/* + * On hash insertion, identify entries with the same inode but + * distinct filehandles. They will all be in the same hash bucket + * because nfs4_file's are hashed by the address in the fi_inode + * field. 
+ */ +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) +{ + unsigned int hashval = file_hashval(fhp); struct nfs4_file *ret = NULL; bool alias_found = false; + struct nfs4_file *fi; spin_lock(&state_lock); - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, + hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - ret = fp; - } else if (d_inode(fh->fh_dentry) == fp->fi_inode) - fp->fi_aliased = alias_found = true; + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) + ret = fi; + } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) + fi->fi_aliased = alias_found = true; } if (likely(ret == NULL)) { - nfsd4_file_init(fh, new); + nfsd4_file_init(fhp, new); hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); new->fi_aliased = alias_found; ret = new; @@ -4724,32 +4742,6 @@ static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, return ret; } -static struct nfs4_file * find_file(struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); - - rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - return fp; -} - -static struct nfs4_file * -find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); - - rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - if (fp) - return fp; - - return insert_file(new, fh, hashval); -} - static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { hlist_del_rcu(&fi->fi_hash); @@ -5641,7 +5633,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. * If not found, create the nfs4_file struct */ - fp = find_or_add_file(open->op_file, current_fh); + fp = nfsd4_file_hash_insert(open->op_file, current_fh); if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) -- cgit From 15424748001a9b5ea62b3e6ad45f0a8b27f01df9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:47 -0400 Subject: NFSD: Refactor find_file() find_file() is now the only caller of find_file_locked(), so just fold these two together. 
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d63ce2118042..a611e7b0f013 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4682,31 +4682,24 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) nfs4_put_stid(&last->st_stid); } -/* search file_hashtbl[] for file */ -static struct nfs4_file * -find_file_locked(const struct svc_fh *fh, unsigned int hashval) +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - struct nfs4_file *fp; + unsigned int hashval = file_hashval(fhp); + struct nfs4_file *fi; - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - return fp; + rcu_read_lock(); + hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) { + rcu_read_unlock(); + return fi; + } } } - return NULL; -} - -static struct nfs4_file * find_file(struct svc_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); - - rcu_read_lock(); - fp = find_file_locked(fh, hashval); rcu_read_unlock(); - return fp; + return NULL; } /* @@ -4757,9 +4750,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_file *fp; __be32 ret = nfs_ok; - fp = find_file(current_fh); + fp = nfsd4_file_hash_lookup(current_fh); if (!fp) return ret; + /* Check for conflicting share reservations */ spin_lock(&fp->fi_lock); if (fp->fi_share_deny & deny_type) -- cgit From d47b295e8d76a4d69f0e2ea0cd8a79c9d3488280 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 28 Oct 2022 10:47:53 -0400 Subject: NFSD: Use rhashtable for managing nfs4_file objects fh_match() is costly, especially when filehandles are large (as is the case for NFSv4). It needs to be used sparingly when searching data structures. Unfortunately, with common workloads, I see multiple thousands of objects stored in file_hashtbl[], which has just 256 buckets, making its bucket hash chains quite lengthy. Walking long hash chains with the state_lock held blocks other activity that needs that lock. Sizable hash chains are a common occurrence once the server has handed out some delegations, for example -- IIUC, each delegated file is held open on the server by an nfs4_file object. To help mitigate the cost of searching with fh_match(), replace the nfs4_file hash table with an rhashtable, which can dynamically resize its bucket array to minimize hash chain length. The result of this modification is an improvement in the latency of NFSv4 operations, and the reduction of nfsd CPU utilization due to eliminating the cost of multiple calls to fh_match() and reducing the CPU cache misses incurred while walking long hash chains in the nfs4_file hash table.
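In outline, the conversion keys the table on the inode pointer and uses the list-valued rhltable flavor, because several nfs4_file objects (one per distinct filehandle) can share an inode. A simplified sketch of the lookup idiom, condensed from the diff below:

	struct rhlist_head *tmp, *list;
	struct nfs4_file *fi, *found = NULL;

	rcu_read_lock();
	list = rhltable_lookup(&nfs4_file_rhltable, &inode,
			       nfs4_file_rhash_params);
	rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
		/* entries sharing the inode are filtered by filehandle */
		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle) &&
		    refcount_inc_not_zero(&fi->fi_ref)) {
			found = fi;
			break;
		}
	}
	rcu_read_unlock();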
Signed-off-by: Chuck Lever Reviewed-by: NeilBrown Reviewed-by: Jeff Layton --- fs/nfsd/nfs4state.c | 97 ++++++++++++++++++++++++++++++++++------------------- fs/nfsd/state.h | 5 +-- 2 files changed, 63 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a611e7b0f013..2ec981fd2985 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -44,7 +44,9 @@ #include #include #include +#include #include + #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -589,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) void put_nfs4_file(struct nfs4_file *fi) { - might_lock(&state_lock); - - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { + if (refcount_dec_and_test(&fi->fi_ref)) { nfsd4_file_hash_remove(fi); - spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); @@ -718,19 +717,20 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) return ret & OWNER_HASH_MASK; } -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) - -static unsigned int file_hashval(const struct svc_fh *fh) -{ - struct inode *inode = d_inode(fh->fh_dentry); +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; - /* XXX: why not (here & in file cache) use inode? */ - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); -} +static const struct rhashtable_params nfs4_file_rhash_params = { + .key_len = sizeof_field(struct nfs4_file, fi_inode), + .key_offset = offsetof(struct nfs4_file, fi_inode), + .head_offset = offsetof(struct nfs4_file, fi_rlist), -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. + */ + .min_size = 256, + .automatic_shrinking = true, +}; /* * Check if courtesy clients have conflicting access and resolve it if possible @@ -4685,12 +4685,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) static noinline_for_stack struct nfs4_file * nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *fi; rcu_read_lock(); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) { rcu_read_unlock(); @@ -4704,40 +4706,56 @@ nfsd4_file_hash_lookup(const struct svc_fh *fhp) /* * On hash insertion, identify entries with the same inode but - * distinct filehandles. They will all be in the same hash bucket - * because nfs4_file's are hashed by the address in the fi_inode - * field. + * distinct filehandles. They will all be on the list returned + * by rhltable_lookup(). + * + * inode->i_lock prevents racing insertions from adding an entry + * for the same inode/fhp pair twice. 
*/ static noinline_for_stack struct nfs4_file * nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) { - unsigned int hashval = file_hashval(fhp); + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; struct nfs4_file *ret = NULL; bool alias_found = false; struct nfs4_file *fi; + int err; - spin_lock(&state_lock); - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, - lockdep_is_held(&state_lock)) { + rcu_read_lock(); + spin_lock(&inode->i_lock); + + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { if (refcount_inc_not_zero(&fi->fi_ref)) ret = fi; - } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) + } else fi->fi_aliased = alias_found = true; } - if (likely(ret == NULL)) { - nfsd4_file_init(fhp, new); - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); - new->fi_aliased = alias_found; - ret = new; - } - spin_unlock(&state_lock); + if (ret) + goto out_unlock; + + nfsd4_file_init(fhp, new); + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, + nfs4_file_rhash_params); + if (err) + goto out_unlock; + + new->fi_aliased = alias_found; + ret = new; + +out_unlock: + spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ret; } static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) { - hlist_del_rcu(&fi->fi_hash); + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, + nfs4_file_rhash_params); } /* @@ -5628,6 +5646,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * If not found, create the nfs4_file struct */ fp = nfsd4_file_hash_insert(open->op_file, current_fh); + if (unlikely(!fp)) + return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) @@ -8042,10 +8062,16 @@ nfs4_state_start(void) { int ret; - ret = nfsd4_create_callback_queue(); + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params); if (ret) return ret; + ret = nfsd4_create_callback_queue(); + if (ret) { + rhltable_destroy(&nfs4_file_rhltable); + return ret; + } + set_max_delegations(); return 0; } @@ -8076,6 +8102,7 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); + rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e2daef3cc003..eadd7f465bf5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -536,16 +536,13 @@ struct nfs4_clnt_odstate { * inode can have multiple filehandles associated with it, so there is * (potentially) a many to one relationship between this struct and struct * inode. - * - * These are hashed by filehandle in the file_hashtbl, which is protected by - * the global state_lock spinlock. */ struct nfs4_file { refcount_t fi_ref; struct inode * fi_inode; bool fi_aliased; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash on fi_fhandle */ + struct rhlist_head fi_rlist; struct list_head fi_stateids; union { struct list_head fi_delegations; -- cgit From 3f054211b29c0fa06dfdcab402c795fd7e906be1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 31 Oct 2022 09:53:26 -0400 Subject: NFSD: Fix licensing header in filecache.c Add a missing SPDX header. 
Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 897dd47ffb0a..deab9b765c88 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * Open file cache. + * The NFSD open file cache. * * (c) 2015 - Jeff Layton * -- cgit From 1f696e230ea5198e393368b319eb55651828d687 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Nov 2022 14:44:47 -0400 Subject: nfsd: remove the pages_flushed statistic from filecache We're counting mapping->nrpages, but not all of those are necessarily dirty. We don't really have a simple way to count just the dirty pages, so just remove this stat since it's not accurate. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index deab9b765c88..2d543a452858 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -58,7 +58,6 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); -static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { @@ -396,7 +395,6 @@ nfsd_file_flush(struct nfsd_file *nf) if (!file || !(file->f_mode & FMODE_WRITE)) return; - this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages); if (vfs_fsync(file, 1) != 0) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } @@ -1023,7 +1021,6 @@ nfsd_file_cache_shutdown(void) per_cpu(nfsd_file_acquisitions, i) = 0; per_cpu(nfsd_file_releases, i) = 0; per_cpu(nfsd_file_total_age, i) = 0; - per_cpu(nfsd_file_pages_flushed, i) = 0; per_cpu(nfsd_file_evictions, i) = 0; } } @@ -1238,7 +1235,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { - unsigned long releases = 0, pages_flushed = 0, evictions = 0; + unsigned long releases = 0, evictions = 0; unsigned long hits = 0, acquisitions = 0; unsigned int i, count = 0, buckets = 0; unsigned long lru = 0, total_age = 0; @@ -1266,7 +1263,6 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) releases += per_cpu(nfsd_file_releases, i); total_age += per_cpu(nfsd_file_total_age, i); evictions += per_cpu(nfsd_file_evictions, i); - pages_flushed += per_cpu(nfsd_file_pages_flushed, i); } seq_printf(m, "total entries: %u\n", count); @@ -1280,6 +1276,5 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "mean age (ms): %ld\n", total_age / releases); else seq_printf(m, "mean age (ms): -\n"); - seq_printf(m, "pages flushed: %lu\n", pages_flushed); return 0; } -- cgit From 8214118589881b2d390284410c5ff275e7a5e03c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Nov 2022 14:44:48 -0400 Subject: nfsd: reorganize filecache.c In a coming patch, we're going to rework how the filecache refcounting works. Move some code around in the function to reduce the churn in the later patches, and rename some of the functions with (hopefully) clearer names: nfsd_file_flush becomes nfsd_file_fsync, and nfsd_file_unhash_and_dispose is renamed to nfsd_file_unhash_and_queue. 
Also, the nfsd_file_put_final tracepoint is renamed to nfsd_file_free, to better match the name of the function from which it's called. Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 111 ++++++++++++++++++++++++++-------------------------- fs/nfsd/trace.h | 4 +- 2 files changed, 58 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2d543a452858..1418bc889aa0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -335,16 +335,59 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) return nf; } +static void +nfsd_file_fsync(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return; + if (vfs_fsync(file, 1) != 0) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); +} + +static int +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return 0; + return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); +} + +static void +nfsd_file_hash_remove(struct nfsd_file *nf) +{ + trace_nfsd_file_unhash(nf); + + if (nfsd_file_check_write_error(nf)) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); + rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, + nfsd_file_rhash_params); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_hash_remove(nf); + return true; + } + return false; +} + static bool nfsd_file_free(struct nfsd_file *nf) { s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime)); bool flush = false; + trace_nfsd_file_free(nf); + this_cpu_inc(nfsd_file_releases); this_cpu_add(nfsd_file_total_age, age); - trace_nfsd_file_put_final(nf); if (nf->nf_mark) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { @@ -378,27 +421,6 @@ nfsd_file_check_writeback(struct nfsd_file *nf) mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); } -static int -nfsd_file_check_write_error(struct nfsd_file *nf) -{ - struct file *file = nf->nf_file; - - if (!file || !(file->f_mode & FMODE_WRITE)) - return 0; - return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); -} - -static void -nfsd_file_flush(struct nfsd_file *nf) -{ - struct file *file = nf->nf_file; - - if (!file || !(file->f_mode & FMODE_WRITE)) - return; - if (vfs_fsync(file, 1) != 0) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); -} - static void nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); @@ -412,31 +434,18 @@ static void nfsd_file_lru_remove(struct nfsd_file *nf) trace_nfsd_file_lru_del(nf); } -static void -nfsd_file_hash_remove(struct nfsd_file *nf) -{ - trace_nfsd_file_unhash(nf); - - if (nfsd_file_check_write_error(nf)) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, - nfsd_file_rhash_params); -} - -static bool -nfsd_file_unhash(struct nfsd_file *nf) +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) { - if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - nfsd_file_hash_remove(nf); - return true; - } - return false; + if (likely(refcount_inc_not_zero(&nf->nf_ref))) + return nf; + return NULL; } static void -nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose) +nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose) { - 
trace_nfsd_file_unhash_and_dispose(nf); + trace_nfsd_file_unhash_and_queue(nf); if (nfsd_file_unhash(nf)) { /* caller must call nfsd_file_dispose_list() later */ nfsd_file_lru_remove(nf); @@ -474,7 +483,7 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_unhash_and_put(nf); if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - nfsd_file_flush(nf); + nfsd_file_fsync(nf); nfsd_file_put_noref(nf); } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) { nfsd_file_put_noref(nf); @@ -483,14 +492,6 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_put_noref(nf); } -struct nfsd_file * -nfsd_file_get(struct nfsd_file *nf) -{ - if (likely(refcount_inc_not_zero(&nf->nf_ref))) - return nf; - return NULL; -} - static void nfsd_file_dispose_list(struct list_head *dispose) { @@ -499,7 +500,7 @@ nfsd_file_dispose_list(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del_init(&nf->nf_lru); - nfsd_file_flush(nf); + nfsd_file_fsync(nf); nfsd_file_put_noref(nf); } } @@ -513,7 +514,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del_init(&nf->nf_lru); - nfsd_file_flush(nf); + nfsd_file_fsync(nf); if (!refcount_dec_and_test(&nf->nf_ref)) continue; if (nfsd_file_free(nf)) @@ -713,7 +714,7 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose) nfsd_file_rhash_params); if (!nf) break; - nfsd_file_unhash_and_dispose(nf, dispose); + nfsd_file_unhash_and_queue(nf, dispose); count++; } while (1); rcu_read_unlock(); @@ -915,7 +916,7 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { if (!net || nf->nf_net == net) - nfsd_file_unhash_and_dispose(nf, &dispose); + nfsd_file_unhash_and_queue(nf, &dispose); nf = rhashtable_walk_next(&iter); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index b648db4d0346..0177cfa50556 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -906,10 +906,10 @@ DEFINE_EVENT(nfsd_file_class, name, \ TP_PROTO(struct nfsd_file *nf), \ TP_ARGS(nf)) -DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_free); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); DEFINE_NFSD_FILE_EVENT(nfsd_file_put); -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue); TRACE_EVENT(nfsd_file_alloc, TP_PROTO( -- cgit From 41e8f85a75fc60e1543e4903428a1b481b672a17 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 11 Nov 2022 09:04:06 -0800 Subject: f2fs: introduce F2FS_IOC_START_ATOMIC_REPLACE introduce a new ioctl to replace the whole content of a file atomically, which means it induces truncate and content update at the same time. We can start it with F2FS_IOC_START_ATOMIC_REPLACE and complete it with F2FS_IOC_COMMIT_ATOMIC_WRITE. Or abort it with F2FS_IOC_ABORT_ATOMIC_WRITE. 
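A minimal userspace sketch of the intended call sequence may help here. It assumes a kernel carrying this patch, with the F2FS_IOC_* definitions visible to userspace (e.g. via <linux/f2fs.h>); the helper name and error handling are illustrative only, not part of this patch:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/f2fs.h>

/* Illustrative helper (hypothetical): atomically replace a file's contents. */
static int replace_file_contents(const char *path, const void *buf, size_t len)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	/* truncate and content update begin together; old data stays until commit */
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_REPLACE) < 0)
		goto out_close;
	if (write(fd, buf, len) != (ssize_t)len)
		goto out_abort;
	if (ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) < 0)
		goto out_abort;
	close(fd);
	return 0;

out_abort:
	/* drop the pending replacement; the original contents survive */
	ioctl(fd, F2FS_IOC_ABORT_ATOMIC_WRITE);
out_close:
	close(fd);
	return -1;
}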
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 21 +++++++++++++++------ fs/f2fs/segment.c | 13 ++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9b47ded653d1..560fa80590e9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3466,6 +3466,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, else if (*blk_addr != NULL_ADDR) return 0; + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; + /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6a8cbf5bb187..b89b5d755ce0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -769,6 +769,7 @@ enum { FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_MAX, /* max flag, never be used */ }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 28f586e77999..ab0a0d3730f6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2034,7 +2034,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) return put_user(inode->i_generation, (int __user *)arg); } -static int f2fs_ioc_start_atomic_write(struct file *filp) +static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); struct user_namespace *mnt_userns = file_mnt_user_ns(filp); @@ -2103,15 +2103,22 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_write_inode(inode, NULL); - isize = i_size_read(inode); - fi->original_i_size = isize; - f2fs_i_size_write(fi->cow_inode, isize); - stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_update_time(sbi, REQ_TIME); @@ -4139,7 +4146,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: - return f2fs_ioc_start_atomic_write(filp); + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_ABORT_ATOMIC_WRITE: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8aa81238c770..5ac026a57228 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,6 +197,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); @@ -261,14 +262,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, 
head, list) { - if (revoke) + if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } + list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) -- cgit From d3b7b4afd6b2c344eabf9cc26b8bfa903c164c7c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 Nov 2022 00:08:47 +0800 Subject: f2fs: fix to do sanity check on i_extra_isize in is_alive() syzbot found an f2fs bug: BUG: KASAN: slab-out-of-bounds in data_blkaddr fs/f2fs/f2fs.h:2891 [inline] BUG: KASAN: slab-out-of-bounds in is_alive fs/f2fs/gc.c:1117 [inline] BUG: KASAN: slab-out-of-bounds in gc_data_segment fs/f2fs/gc.c:1520 [inline] BUG: KASAN: slab-out-of-bounds in do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 Read of size 4 at addr ffff888076557568 by task kworker/u4:3/52 CPU: 1 PID: 52 Comm: kworker/u4:3 Not tainted 6.1.0-rc4-syzkaller-00362-gfef7fd48922d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x15e/0x45d mm/kasan/report.c:395 kasan_report+0xbb/0x1f0 mm/kasan/report.c:495 data_blkaddr fs/f2fs/f2fs.h:2891 [inline] is_alive fs/f2fs/gc.c:1117 [inline] gc_data_segment fs/f2fs/gc.c:1520 [inline] do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 f2fs_gc+0x88c/0x20a0 fs/f2fs/gc.c:1831 f2fs_balance_fs+0x544/0x6b0 fs/f2fs/segment.c:410 f2fs_write_inode+0x57e/0xe20 fs/f2fs/inode.c:753 write_inode fs/fs-writeback.c:1440 [inline] __writeback_single_inode+0xcfc/0x1440 fs/fs-writeback.c:1652 writeback_sb_inodes+0x54d/0xf90 fs/fs-writeback.c:1870 wb_writeback+0x2c5/0xd70 fs/fs-writeback.c:2044 wb_do_writeback fs/fs-writeback.c:2187 [inline] wb_workfn+0x2dc/0x12f0 fs/fs-writeback.c:2227 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The root cause is that we forgot to do a sanity check on .i_extra_isize in the below path, resulting in an invalid address being accessed later; fix it. - gc_data_segment - is_alive - data_blkaddr - offset_in_addr Reported-by: syzbot+f8f3dfa4abc489e768a1@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-f2fs-devel/0000000000003cb3c405ed5c17f9@google.com/T/#u Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1a46519a5fe..0f967b1e98f2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1077,7 +1077,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node, max_addrs; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1103,11 +1103,17 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - max_addrs = IS_INODE(node_page) ?
DEF_ADDRS_PER_INODE : - DEF_ADDRS_PER_BLOCK; - if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", - ofs_in_node, dni->ino, dni->nid, max_addrs); + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); f2fs_put_page(node_page, 1); return false; } -- cgit From 5b7b74b71c7fefbaa3e0ccc120c3cbd50b3fad86 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 12 Nov 2022 00:13:49 +0800 Subject: f2fs: remove submit label in __submit_discard_cmd() Complaint from Matthew Wilcox in another similar place: "submit? You don't submit anything at the 'submit' label. it should be called 'skip' or something. But I think this is just badly written and you don't need a goto at all." Let's remove the submit label for readability. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5ac026a57228..8b0b76550578 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1143,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) -- cgit From b7ad23cec26a91a4f7c45ff7ff8e915f21ac5127 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:35 +0800 Subject: f2fs: fix to alloc_mode changed after remount on a small volume device Commit 84b89e5d943d8 ("f2fs: add auto tuning for small devices") added tuning for small volume devices, setting alloc_mode to 'reuse' when the volume is small. But alloc_mode changes back to 'default' on remount of such a small device. This patch fixes alloc_mode being changed on remount for a small volume device.
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 75027ff85cd9..96cfe626a670 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2059,7 +2059,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); @@ -4077,7 +4081,6 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; if (f2fs_block_unit_discard(sbi)) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | -- cgit From 777cd95b8066ced4e9a2534941b81f8ad98e74fb Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:36 +0800 Subject: f2fs: cleanup for 'f2fs_tuning_parameters' function A cleanup patch for the 'f2fs_tuning_parameters' function. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96cfe626a670..05101cd4140b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4077,13 +4077,11 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - /* adjust parameters according to the volume size */ - if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } -- cgit From 66aee5aaa237e5a3475581b462b8b22e0944d264 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:37 +0800 Subject: f2fs: change type for 'sbi->readdir_ra' Before this patch, the variable 'readdir_ra' only takes effect based on whether or not it is equal to '1', so we can change its type from 'int' to 'bool'.
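To make the semantic change concrete, here is a small illustrative snippet (plain C with hypothetical values, not kernel code): the old int field was compared against the exact value 1, so writing 2 via sysfs disabled readahead, while the bool field together with the new '!!t' sysfs handler treats any non-zero write as enabled.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long t = 2;		/* e.g. "echo 2" into the sysfs node */
	bool ra_old = (t == 1);		/* old test: the value 2 meant "off" */
	bool ra_new = !!t;		/* new handler: any non-zero means "on" */

	printf("old=%d new=%d\n", ra_old, ra_new);
	return 0;
}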
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 5 +++++ 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 030b7fd4142f..8e025157f35c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1010,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; - bool readdir_ra = sbi->readdir_ra == 1; + bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b89b5d755ce0..96bd3461c0bb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1698,7 +1698,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ - int readdir_ra; /* readahead inode in readdir */ + bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 05101cd4140b..31435c8645c8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4085,7 +4085,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) 1 << F2FS_IPU_HONOR_OPU_WRITE; } - sbi->readdir_ra = 1; + sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 97bf0dbb0974..33ec467b3772 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -656,6 +656,11 @@ out: return count; } + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return count; + } + *ui = (unsigned int)t; return count; -- cgit From 4ff23a6547b81ca22adb852dfe93ee5fc45328ac Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 17 Nov 2022 23:10:54 +0800 Subject: f2fs: set zstd compress level correctly Fixes: cf30f6a5f0c6 ("lib: zstd: Add kernel-specific API") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Reviewed-by: Nick Terrell Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d315c2de136f..74d3f2d2271f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; - params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen); + params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(&params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), -- cgit From bc12ac98ea2e1b70adc6478c8b473a0003b659d3 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Jun 2022 19:26:46 +0800 Subject: ext4: silence the warning when evicting inode with dioread_nolock When evicting an inode with the default dioread_nolock, it can be raced by the unwritten-extent-converting kworker after writing back some newly allocated dirty blocks. The kworker converts unwritten extents to written; the extents can be merged to the upper level, freeing extent blocks, so it can mark the inode dirty again even though this inode has already been marked I_FREEING. But the inode->i_io_list check and warning in ext4_evict_inode() miss this corner case.
Fortunately, ext4_evict_inode() waits for all extent conversions to finish before this check, so it will not lead to an inode use-after-free problem; everything is OK besides this warning. The WARN_ON_ONCE was originally designed for finding inode use-after-free issues in advance, but if we add the current dioread_nolock case in, it becomes much less useful, so fix this warning by simply removing this check. ====== WARNING: CPU: 7 PID: 1092 at fs/ext4/inode.c:227 ext4_evict_inode+0x875/0xc60 ... RIP: 0010:ext4_evict_inode+0x875/0xc60 ... Call Trace: evict+0x11c/0x2b0 iput+0x236/0x3a0 do_unlinkat+0x1b4/0x490 __x64_sys_unlinkat+0x4c/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fa933c1115b ====== rm kworker ext4_end_io_end() vfs_unlink() ext4_unlink() ext4_convert_unwritten_io_end_vec() ext4_convert_unwritten_extents() ext4_map_blocks() ext4_ext_map_blocks() ext4_ext_try_to_merge_up() __mark_inode_dirty() check !I_FREEING locked_inode_to_wb_and_lock_list() iput() iput_final() evict() ext4_evict_inode() truncate_inode_pages_final() //wait release io_end inode_io_list_move_locked() ext4_release_io_end() trigger WARN_ON_ONCE() ====== Cc: stable@kernel.org Fixes: ceff86fddae8 ("ext4: Avoid freeing inodes on dirty list") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220629112647.4141034-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b5ef1b64249..7c5f5dabe0fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode) /* * For inodes with journalled data, transaction commit could have - * dirtied the inode. Flush worker is ignoring it because of I_FREEING - * flag but we still need to remove the inode from the writeback lists. + * dirtied the inode. And for inodes with dioread_nolock, unwritten + * extents converting worker could merge extents and also have dirtied + * the inode. Flush worker is ignoring it because of I_FREEING flag but + * we still need to remove the inode from the writeback lists. */ - if (!list_empty_careful(&inode->i_io_list)) { - WARN_ON_ONCE(!ext4_should_journal_data(inode)); + if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); - } /* * Protect us against freezing - iput() caller didn't have to have any -- cgit From 787caf1bdcd9f04058e4e8d8ed56db1dbafea0b7 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:29 +0800 Subject: f2fs: fix to enable compress for newly created file if extension matches If compress_extension is set, and a newly created file matches the extension, the file can be marked as a compressed file. However, if inline_data is also enabled, there is no chance to check its extension since f2fs_should_compress() always returns false. This patch moves set_compress_inode(), which does the extension check, so that extensions are checked before the inline data flag is set.
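The ordering established by this patch can be summarized with a simplified sketch (the wrapper name here is hypothetical; the two calls and their order follow the diff below):

/* Sketch of the new flag-setup order in f2fs_new_inode(): the
 * extension-based compression decision runs first, so it can no longer
 * be masked by inline_data. */
static void init_new_inode_flags(struct f2fs_sb_info *sbi, struct inode *dir,
				 struct inode *inode, const char *name)
{
	/* 1. decide compression from the file name (and inheritance) */
	set_compress_new_inode(sbi, dir, inode, name);

	/* 2. only then consider inline_data, if it is still allowed */
	if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
		set_inode_flag(inode, FI_INLINE_DATA);
}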
Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/namei.c | 329 ++++++++++++++++++++++++++++---------------------------- 2 files changed, 164 insertions(+), 167 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 96bd3461c0bb..f0833638f59e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2980,7 +2980,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e104409c3a0e..54448dccbb6a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,8 +22,163 @@ #include "acl.h" #include +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + if (sublen == 1 && *sub == '*') + return 1; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". + */ + if (slen < sublen + 2) + return 0; + + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } + + return 0; +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, 
const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi)) + return; + + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. */ + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; + + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ + for (i = 0; i < ext_cnt; i++) { + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); + return; + } + } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, - struct inode *dir, umode_t mode) + struct inode *dir, umode_t mode, + const char *name) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; @@ -114,12 +269,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); - if (f2fs_sb_has_compression(sbi)) { - /* Inherit the compression flag in directory */ - if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && - f2fs_may_compress(inode)) - set_compress_context(inode); - } + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); /* Should enable inline_data after compression set */ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -153,40 +304,6 @@ fail_drop: return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub, - bool tmp_ext) -{ - size_t slen = strlen(s); - size_t sublen = strlen(sub); - int i; - - if (sublen == 1 && *sub == '*') - return 1; - - /* - * filename format of multimedia file should be defined as: - * "filename + '.' + extension + (optional: '.' + temp extension)". 
- */ - if (slen < sublen + 2) - return 0; - - if (!tmp_ext) { - /* file has no temp extension */ - if (s[slen - sublen - 1] != '.') - return 0; - return !strncasecmp(s + slen - sublen, sub, sublen); - } - - for (i = 1; i < slen - sublen; i++) { - if (s[i] != '.') - continue; - if (!strncasecmp(s + i + 1, sub, sublen)) - return 1; - } - - return 0; -} - /* * Set file's temperature for hot/cold data separation */ @@ -217,124 +334,6 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * file_set_hot(inode); } -int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, - bool hot, bool set) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int cold_count = le32_to_cpu(sbi->raw_super->extension_count); - int hot_count = sbi->raw_super->hot_ext_count; - int total_count = cold_count + hot_count; - int start, count; - int i; - - if (set) { - if (total_count == F2FS_MAX_EXTENSION) - return -EINVAL; - } else { - if (!hot && !cold_count) - return -EINVAL; - if (hot && !hot_count) - return -EINVAL; - } - - if (hot) { - start = cold_count; - count = total_count; - } else { - start = 0; - count = cold_count; - } - - for (i = start; i < count; i++) { - if (strcmp(name, extlist[i])) - continue; - - if (set) - return -EINVAL; - - memcpy(extlist[i], extlist[i + 1], - F2FS_EXTENSION_LEN * (total_count - i - 1)); - memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); - if (hot) - sbi->raw_super->hot_ext_count = hot_count - 1; - else - sbi->raw_super->extension_count = - cpu_to_le32(cold_count - 1); - return 0; - } - - if (!set) - return -EINVAL; - - if (hot) { - memcpy(extlist[count], name, strlen(name)); - sbi->raw_super->hot_ext_count = hot_count + 1; - } else { - char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; - - memcpy(buf, &extlist[cold_count], - F2FS_EXTENSION_LEN * hot_count); - memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - memcpy(extlist[cold_count], name, strlen(name)); - memcpy(&extlist[cold_count + 1], buf, - F2FS_EXTENSION_LEN * hot_count); - sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); - } - return 0; -} - -static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; - unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; - unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; - int i, cold_count, hot_count; - - if (!f2fs_sb_has_compression(sbi) || - F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode) || - (!ext_cnt && !noext_cnt)) - return; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], false)) { - f2fs_up_read(&sbi->sb_lock); - return; - } - } - - f2fs_up_read(&sbi->sb_lock); - - for (i = 0; i < noext_cnt; i++) { - if (is_extension_exist(name, noext[i], false)) { - f2fs_disable_compressed_file(inode); - return; - } - } - - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return; - - for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i], false)) - continue; - - /* Do not use inline_data with compression */ - stat_dec_inline_inode(inode); - clear_inode_flag(inode, 
FI_INLINE_DATA); - set_compress_context(inode); - return; - } -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -352,15 +351,13 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); - set_compress_inode(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -689,7 +686,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -760,7 +757,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -817,7 +814,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -856,7 +853,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); -- cgit From b16bcaaf7a325f90967259a0b7cfcce4ff8c56ba Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:30 +0800 Subject: f2fs: move set_file_temperature into f2fs_new_inode Since the file name has already been passed to f2fs_new_inode(), let's move set_file_temperature() into f2fs_new_inode().
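For reference, the following hypothetical calls illustrate how the tmp_ext flag of is_extension_exist(), which set_file_temperature() uses with tmp_ext == true, changes the matching (derived from the implementation shown earlier in this series):

/* Hypothetical inputs and results for is_extension_exist():
 *
 *   is_extension_exist("movie.mp4",     "mp4", true)  -> 1  (plain match)
 *   is_extension_exist("movie.mp4.tmp", "mp4", true)  -> 1  (temp extension tolerated)
 *   is_extension_exist("movie.mp4.tmp", "mp4", false) -> 0  (name must end in ".mp4")
 *   is_extension_exist("anything",      "*",   false) -> 1  (wildcard always matches)
 */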
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 62 +++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 54448dccbb6a..58a91ce8fe08 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -176,6 +176,32 @@ inherit_comp: } } +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], true)) + break; + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, struct inode *dir, umode_t mode, const char *name) @@ -276,6 +302,9 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -304,36 +333,6 @@ fail_drop: return ERR_PTR(err); } -/* - * Set file's temperature for hot/cold data separation - */ -static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, cold_count, hot_count; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], true)) - break; - } - - f2fs_up_read(&sbi->sb_lock); - - if (i == cold_count + hot_count) - return; - - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -355,9 +354,6 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_file_temperature(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; -- cgit From 620816393239890feff8608251e2746b1cc2cfa0 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 17 Nov 2022 01:10:45 +0800 Subject: f2fs: make __queue_discard_cmd() return void Since __queue_discard_cmd() never returns an error, let's make it return void. 
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8b0b76550578..14ece4bf7c7e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1358,13 +1358,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } -static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) - return 0; + return; trace_f2fs_queue_discard(bdev, blkstart, blklen); @@ -1376,7 +1376,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@ -1776,7 +1775,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } /* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif @@ -1787,7 +1787,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, -- cgit From 78a99fe6254cad4be310cd84af39f6c46b668c72 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Nov 2022 06:42:52 +0800 Subject: f2fs: truncate blocks in batch in __complete_revoke_list() Use f2fs_do_truncate_blocks() to truncate all blocks in-batch in __complete_revoke_list(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 14ece4bf7c7e..37c721e1eb03 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -262,24 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; - pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) { + if (revoke) __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); - } else if (truncate) { - f2fs_truncate_hole(inode, start_index, cur->index); - start_index = cur->index + 1; - } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) - f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); + f2fs_do_truncate_blocks(inode, 0, false); } static int __f2fs_commit_atomic_write(struct inode *inode) -- cgit From e219aecfd4b766c4e878a3769057e9809f7fcadc Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Tue, 22 Nov 2022 18:03:20 +0900 Subject: f2fs: avoid victim selection from previous victim section When f2fs chooses GC victim in large section & LFS mode, next_victim_seg[gc_type] is referenced first. After segment is freed, next_victim_seg[gc_type] has the next segment number. 
However, next_victim_seg[gc_type] still holds the last segment number even after the last segment of the section is freed. In this case, when f2fs chooses a victim for the next GC round, the last segment of the previous victim section is chosen as the victim. Initialize next_victim_seg[gc_type] to NULL_SEGNO for the last segment in a large section. Fixes: e3080b0120a1 ("f2fs: support subsectional garbage collection") Signed-off-by: Yonggil Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0f967b1e98f2..f1b68eda2235 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1749,8 +1749,9 @@ freed: get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - if (__is_large_section(sbi) && segno + 1 < end_segno) - sbi->next_victim_seg[gc_type] = segno + 1; + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } -- cgit From 48c08c51f938d955dd8f5b8972bc29faa4c9556f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 18 Nov 2022 11:46:00 +0800 Subject: f2fs: init discard policy after thread wakeup Under the current logic, after the discard thread wakes up, it does not run according to the newly expected policy, but keeps using the policy that was selected before it went to sleep. Move the strategy selection to after the thread wakes up, so that the running state of the thread meets expectations. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 37c721e1eb03..73ad8dc9a4d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1679,6 +1679,11 @@ static int issue_discard_thread(void *data) set_freezable(); do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@ -1686,14 +1691,6 @@ static int issue_discard_thread(void *data) __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); - if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0; @@ -1707,12 +1704,11 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue; sb_start_intwrite(sbi->sb); @@ -1727,6 +1723,8 @@ static int issue_discard_thread(void *data) } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); -- cgit From 1cd2e6d544359ae13e6fd9029b6018b957cf08c3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:01 +0800 Subject: f2fs: define MIN_DISCARD_GRANULARITY macro Do a cleanup in f2fs_tuning_parameters() and __init_discard_policy(): let's use a macro instead of a bare number.
Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f0833638f59e..4694b55b6df4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -329,6 +329,8 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 73ad8dc9a4d3..c7afcc6cd75b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1065,7 +1065,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; @@ -1080,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 31435c8645c8..daf14b55a972 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4080,7 +4080,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } -- cgit From 8a47d228de6a4fd4c751142fb27d56f385b3fe41 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:02 +0800 Subject: f2fs: introduce discard_urgent_util sysfs node Through this node, you can control whether background discard runs more or less aggressively once the utilization of the space reaches the configured threshold. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4694b55b6df4..296683648d4f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -413,6 +413,7 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max.
interval between discard issue */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c7afcc6cd75b..0ff451ea18f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1064,7 +1064,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = @@ -2079,6 +2079,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 33ec467b3772..a4745d596310 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -493,6 +493,13 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -800,6 +807,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -930,6 +938,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), -- cgit From f43dc4dc3eff028b5ddddd99f3a66c5a6bdd4e78 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:11 +1100 Subject: iomap: buffered write failure should not truncate the page cache iomap_file_buffered_write_punch_delalloc() currently invalidates the page cache over the unused range of the delalloc extent that was allocated. While the write allocated the delalloc extent, it does not own it exclusively as the write does not hold any locks that prevent either writeback or mmap page faults from changing the state of either the page cache or the extent state backing this range. Whilst xfs_bmap_punch_delalloc_range() already handles races in extent conversion - it will only punch out delalloc extents and it ignores any other type of extent - the page cache truncate does not discriminate between data written by this write or some other task. 
As a result, truncating the page cache can result in data corruption if the write races with mmap modifications to the file over the same range. generic/346 exercises this workload, and if we randomly fail writes (as will happen when iomap gets stale iomap detection later in the patchset), it will randomly corrupt the file data because it removes data written by mmap() in the same page as the write() that failed. Hence we do not want to punch out the page cache over the range of the extent we failed to write to - what we actually need to do is detect the ranges that have dirty data in cache over them and *not punch them out*. To do this, we have to walk the page cache over the range of the delalloc extent we want to remove. This is made complex by the fact we have to handle partially up-to-date folios correctly and this can happen even when the FSB size == PAGE_SIZE because we now support multi-page folios in the page cache. Because we are only interested in discovering the edges of data ranges in the page cache (i.e. hole-data boundaries) we can make use of mapping_seek_hole_data() to find those transitions in the page cache. As we hold the invalidate_lock, we know that the boundaries are not going to change while we walk the range. This interface is also byte-based and is sub-page block aware, so we can find the data ranges in the cache based on byte offsets rather than page, folio or fs block sized chunks. This greatly simplifies the logic of finding dirty cached ranges in the page cache. Once we've identified a range that contains cached data, we can then iterate the range folio by folio. This allows us to determine if the data is dirty and hence perform the correct delalloc extent punching operations. The seek interface we use to iterate data ranges will give us sub-folio start/end granularity, so we may end up looking up the same folio multiple times as the seek interface iterates across each discontiguous data region in the folio. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 195 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 180 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 734b761a1e4a..dca9ec9dc4a8 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -832,6 +832,165 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * Scan the data range passed to us for dirty page cache folios. If we find a + * dirty folio, punch out the preceding range and update the offset + * from which the next punch will start. + * + * We can punch out storage reservations under clean pages because they either + * contain data that has been written back - in which case the delalloc punch + * over that range is a no-op - or they have been read faults in which case they + * contain zeroes and we can remove the delalloc backing range and any new + * writes to those pages will do the normal hole filling operation... + * + * This makes the logic simple: we need to keep the delalloc extents + * only over the dirty ranges of the page cache. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations.
+ */ +static int iomap_write_delalloc_scan(struct inode *inode, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t offset, loff_t length)) +{ + while (start_byte < end_byte) { + struct folio *folio; + + /* grab locked page */ + folio = filemap_lock_folio(inode->i_mapping, + start_byte >> PAGE_SHIFT); + if (!folio) { + start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + + PAGE_SIZE; + continue; + } + + /* if dirty, punch up to offset */ + if (folio_test_dirty(folio)) { + if (start_byte > *punch_start_byte) { + int error; + + error = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (error) { + folio_unlock(folio); + folio_put(folio); + return error; + } + } + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_next_index(folio) << PAGE_SHIFT); + } + + /* move offset to start of next folio in range */ + start_byte = folio_next_index(folio) << PAGE_SHIFT; + folio_unlock(folio); + folio_put(folio); + } + return 0; +} + +/* + * Punch out all the delalloc blocks in the range given except for those that + * have dirty data still pending in the page cache - those are going to be + * written and so must still retain the delalloc backing for writeback. + * + * As we are scanning the page cache for data, we don't need to reimplement the + * wheel - mapping_seek_hole_data() does exactly what we need to identify the + * start and end of data ranges correctly even for sub-folio block sizes. This + * byte range based iteration is especially convenient because it means we + * don't have to care about variable size folios, nor where the start or end of + * the data range lies within a folio, if they lie within the same folio or even + * if there are multiple discontiguous data ranges within the folio. + * + * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so + * can return data ranges that exist in the cache beyond EOF. e.g. a page fault + * spanning EOF will initialise the post-EOF data to zeroes and mark it up to + * date. A write page fault can then mark it dirty. If we then fail a write() + * beyond EOF into that up to date cached range, we allocate a delalloc block + * beyond EOF and then have to punch it out. Because the range is up to date, + * mapping_seek_hole_data() will return it, and we will skip the punch because + * the folio is dirty. This is incorrect - we always need to punch out delalloc + * beyond EOF in this case as writeback will never write back and convert that + * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, + * resulting in always punching out the range from the EOF to the end of the + * range the iomap spans. + * + * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it + * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA + * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) + * returns the end of the data range (data_end). Using closed intervals would + * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose + * the code to subtle off-by-one bugs....
+ */ +static int iomap_write_delalloc_release(struct inode *inode, + loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t punch_start_byte = start_byte; + loff_t scan_end_byte = min(i_size_read(inode), end_byte); + int error = 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite whilst we walk the + * cache and perform delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + while (start_byte < scan_end_byte) { + loff_t data_end; + + start_byte = mapping_seek_hole_data(inode->i_mapping, + start_byte, scan_end_byte, SEEK_DATA); + /* + * If there is no more data to scan, all that is left is to + * punch out the remaining range. + */ + if (start_byte == -ENXIO || start_byte == scan_end_byte) + break; + if (start_byte < 0) { + error = start_byte; + goto out_unlock; + } + WARN_ON_ONCE(start_byte < punch_start_byte); + WARN_ON_ONCE(start_byte > scan_end_byte); + + /* + * We find the end of this contiguous cached data range by + * seeking from start_byte to the beginning of the next hole. + */ + data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, + scan_end_byte, SEEK_HOLE); + if (data_end < 0) { + error = data_end; + goto out_unlock; + } + WARN_ON_ONCE(data_end <= start_byte); + WARN_ON_ONCE(data_end > scan_end_byte); + + error = iomap_write_delalloc_scan(inode, &punch_start_byte, + start_byte, data_end, punch); + if (error) + goto out_unlock; + + /* The next data search starts at the end of this one. */ + start_byte = data_end; + } + + if (punch_start_byte < end_byte) + error = punch(inode, punch_start_byte, + end_byte - punch_start_byte); +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return error; +} + /* * When a short write occurs, the filesystem may need to remove reserved space * that was allocated in ->iomap_begin from it's ->iomap_end method. For @@ -842,8 +1001,25 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); * allocated for this iomap. * * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations, but converts them back to {offset,len} tuples for - * the punch callback. + * simplify range iterations. + * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. 
+ * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock */ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, @@ -853,7 +1029,6 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t start_byte; loff_t end_byte; int blocksize = i_blocksize(inode); - int error = 0; if (iomap->type != IOMAP_DELALLOC) return 0; @@ -877,18 +1052,8 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, if (start_byte >= end_byte) return 0; - /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite between the page cache - * truncation and the delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. - */ - filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache_range(inode, start_byte, end_byte - 1); - error = punch(inode, start_byte, end_byte - start_byte); - filemap_invalidate_unlock(inode->i_mapping); - - return error; + return iomap_write_delalloc_release(inode, start_byte, end_byte, + punch); } EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); -- cgit From 7348b322332d8602a4133f0b861334ea021b134a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: xfs: xfs_bmap_punch_delalloc_range() should take a byte range All the callers of xfs_bmap_punch_delalloc_range() jump through hoops to convert a byte range to filesystem blocks before calling xfs_bmap_punch_delalloc_range(). Instead, pass the byte range to xfs_bmap_punch_delalloc_range() and have it do the conversion to filesystem blocks internally. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong --- fs/xfs/xfs_aops.c | 16 ++++++---------- fs/xfs/xfs_bmap_util.c | 10 ++++++---- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_iomap.c | 8 ++------ 4 files changed, 15 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5d1a995b15f8..6aadc5815068 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -114,9 +114,8 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, offset), - XFS_B_TO_FSB(mp, size)); + xfs_bmap_punch_delalloc_range(ip, offset, + offset + size); } goto done; } @@ -455,12 +454,8 @@ xfs_discard_folio( struct folio *folio, loff_t pos) { - struct inode *inode = folio->mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - size_t offset = offset_in_folio(folio, pos); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) @@ -470,8 +465,9 @@ xfs_discard_folio( "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", folio, ip->i_ino, pos); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_folio(inode, folio) - pageoff_fsb); + error = xfs_bmap_punch_delalloc_range(ip, pos, + round_up(pos, folio_size(folio))); + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 04d0c2bff67c..867645b74d88 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -590,11 +590,13 @@ out_unlock_iolock: int xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) + xfs_off_t start_byte, + xfs_off_t end_byte) { + struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = &ip->i_df; - xfs_fileoff_t end_fsb = start_fsb + length; + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range( while (got.br_startoff + got.br_blockcount > start_fsb) { del = got; - xfs_trim_extent(&del, start_fsb, length); + xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); /* * A delete can push the cursor forward. 
Step back to the diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 24b37d211f1d..6888078f5c31 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); + xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ea96e8a34868..09676ff6940e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1126,12 +1126,8 @@ xfs_buffered_write_delalloc_punch( loff_t offset, loff_t length) { - struct xfs_mount *mp = XFS_M(inode->i_sb); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); - - return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, - end_fsb - start_fsb); + return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, + offset + length); } static int -- cgit From d7b64041164ca177170191d2ad775da074ab2926 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: iomap: write iomap validity checks A recent multithreaded write data corruption has been uncovered in the iomap write code. The core of the problem is that partial folio writes can be flushed to disk while a new racing write can map it and fill the rest of the page:

	writeback			new write

	allocate blocks
	  blocks are unwritten
	submit IO
	.....
					map blocks
					iomap indicates UNWRITTEN range
					loop {
					  lock folio
					  copyin data
	.....
	IO completes
	  runs unwritten extent conv
	    blocks are marked written
					  get next folio
					}

Now add memory pressure such that memory reclaim evicts the partially written folio that has already been written to disk. When the new write finally gets to the last partial page of the new write, it does not find it in cache, so it instantiates a new page, sees the iomap is unwritten, and zeros the part of the page that it does not have data from. This overwrites the data on disk that was originally written. The full description of the corruption mechanism can be found here: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ To solve this problem, we need to check whether the iomap is still valid after we lock each folio during the write. We have to do it after we lock the page so that we don't end up with state changes occurring while we wait for the folio to be locked. Hence we need a mechanism to be able to check that the cached iomap is still valid (similar to what we already do in buffered writeback), and we need a way for ->begin_write to back out and tell the high level iomap iterator that we need to remap the remaining write range. The iomap needs to grow some storage for the validity cookie that the filesystem provides to travel with the iomap. XFS, in particular, also needs to know some more information about what the iomap maps (attribute extents rather than file data extents) in order for the validity cookie to cover all the types of iomaps we might need to validate. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J.
Wong --- fs/iomap/buffered-io.c | 29 ++++++++++++++++++++++++++++- fs/iomap/iter.c | 19 ++++++++++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index dca9ec9dc4a8..356193e44cf0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); } -static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, +static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; @@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } + + /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. + * IOMAP_UNWRITTEN state can change and memory reclaim could have + * reclaimed a previously partially written page at this index after IO + * completion before this write reaches this file offset) and hence we + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. + */ + if (page_ops && page_ops->iomap_valid) { + bool iomap_valid = page_ops->iomap_valid(iter->inode, + &iter->iomap); + if (!iomap_valid) { + iter->iomap.flags |= IOMAP_F_STALE; + status = 0; + goto out_unlock; + } + } + if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; @@ -773,6 +794,8 @@ again: status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) break; + if (iter->iomap.flags & IOMAP_F_STALE) + break; page = folio_file_page(folio, pos >> PAGE_SHIFT); if (mapping_writably_mapped(mapping)) @@ -1081,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) @@ -1136,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index a1c7592d2ade..79a0614eaab7 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,12 +7,28 @@ #include #include "trace.h" +/* + * Advance to the next range we need to map. + * + * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully + * processed - it was aborted because the extent the iomap spanned may have been + * changed during the operation. In this case, the iteration behaviour is to + * remap the unprocessed range of the iter, and that means we may need to remap + * even when we've made no progress (i.e. iter->processed = 0). Hence the + * "finished iterating" case needs to distinguish between + * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we + * need to remap the entire remaining range. 
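+ *
+ * A summary sketch of the cases handled below:
+ *
+ *	processed < 0			propagate the error
+ *	processed == 0 && !stale	iteration is complete
+ *	processed == 0 && stale		remap the same, unprocessed range
+ *	processed > 0			advance past the processed bytes;
+ *					any stale remainder is then remapped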
+ */ static inline int iomap_iter_advance(struct iomap_iter *iter) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + /* handle the previous iteration (if any) */ if (iter->iomap.length) { - if (iter->processed <= 0) + if (iter->processed < 0) return iter->processed; + if (!iter->processed && !stale) + return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; @@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) { WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) -- cgit From 304a68b9c63bbfc1f6e159d68e8892fc54a06067 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: xfs: use iomap_valid method to detect stale cached iomaps Now that iomap supports a mechanism to validate cached iomaps for buffered write operations, hook it up to the XFS buffered write ops so that we can avoid data corruptions that result from stale cached iomaps. See: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ or the ->iomap_valid() introduction commit for exact details of the corruption vector. The validity cookie we store in the iomap is based on the type of iomap we return. It is expected that the iomap->flags we set in xfs_bmbt_to_iomap() are not perturbed by the iomap core and are returned to us in the iomap passed via the .iomap_valid() callback. This ensures that the validity cookie is always checking the correct inode fork sequence numbers to detect potential changes that affect the extent cached by the iomap. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 6 ++- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_iomap.c | 95 ++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_iomap.h | 5 ++- fs/xfs/xfs_pnfs.c | 6 ++- 5 files changed, 87 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 49d0d4ea63fc..56b9b7db38bb 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset.
*/ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6aadc5815068..a22d90af40c8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -372,7 +372,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 09676ff6940e..26ca3cc1a048 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -48,13 +48,45 @@ xfs_alert_fsblock_zero( return -EFSCORRUPTED; } +u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_af.if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. + */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + return iomap->validity_cookie == + xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); +} + +const struct iomap_page_ops xfs_iomap_page_ops = { + .iomap_valid = xfs_iomap_valid, +}; + int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags) + u16 iomap_flags, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -91,6 +123,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; + iomap->page_ops = &xfs_iomap_page_ops; return 0; } @@ -195,7 +230,8 @@ xfs_iomap_write_direct( xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -285,6 +321,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -743,6 +780,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -811,9 +849,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; } + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; @@ -839,24 +878,26 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = 
xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - flags, &imap); + flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - iomap_flags | IOMAP_F_NEW); + iomap_flags | IOMAP_F_NEW, seq); out_found_cow: - xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + seq = xfs_iomap_inode_sequence(ip, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); out_unlock: if (lockmode) @@ -915,6 +956,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1094,26 +1136,31 @@ retry: * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1193,6 +1240,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1206,13 +1254,14 @@ xfs_read_iomap_begin( &nimaps, 0); if (!error && (flags & IOMAP_REPORT)) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - shared ? IOMAP_F_SHARED : 0); + shared ? 
IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1237,6 +1286,7 @@ xfs_seek_iomap_begin( struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1271,8 +1321,9 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1293,8 +1344,9 @@ xfs_seek_iomap_begin( imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1320,6 +1372,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1336,12 +1389,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 0f62ab633040..4da13440bae9 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,14 +13,15 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap); + struct xfs_bmbt_irec *imap, u64 *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); +u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags); + u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 37a24f0f7cd4..38d23f0e703a 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -125,6 +125,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -176,6 +177,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0); ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); @@ -189,7 +191,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, 0, &imap); + end_fsb - offset_fsb, 0, &imap, &seq); if (error) goto out_unlock; @@ -209,7 +211,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); *device_generation = mp->m_generation; return error; 
out_unlock: -- cgit From 6e8af15ccdc4e138a5b529c1901a0013e1dcaa09 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Nov 2022 09:09:17 +1100 Subject: xfs: drop write error injection is unfixable, remove it With the changes to scan the page cache for dirty data to avoid data corruptions from partial write cleanup racing with other page cache operations, the drop writes error injection no longer works the same way it used to and causes xfs/196 to fail. This is because xfs/196 writes to the file and populates the page cache before it turns on the error injection and starts failing -overwrites-. The result is that the original drop-writes code failed writes only -after- overwriting the data in the cache, followed by invalidating the cached data and then punching out the delalloc extent from under that data. On the surface, this looks fine. The problem is that page cache invalidation *doesn't guarantee that it removes anything from the page cache* and it doesn't change the dirty state of the folio. When block size == page size and we do page aligned IO (as xfs/196 does) everything happens to align perfectly and page cache invalidation removes the single page folios that span the written data. Hence the followup delalloc punch pass does not find cached data over that range and it can punch the extent out. IOWs, xfs/196 "works" for block size == page size with the new code. I say "works", because it actually only works for the case where IO is page aligned, and no data was read from disk before writes occur. Because the moment we actually read data first, the readahead code allocates multipage folios and suddenly the invalidate code goes back to zeroing subfolio ranges without changing dirty state. Hence, with multipage folios in play, block size == page size is functionally identical to block size < page size behaviour, and drop-writes is manifestly broken w.r.t. this case. Invalidation of a subfolio range doesn't result in the folio being removed from the cache, just the range gets zeroed. Hence after we've sequentially walked over a folio that we've dirtied (via write data) and then invalidated, we end up with a dirty folio full of zeroed data. And because the new code skips punching ranges that have dirty folios covering them, we end up leaving the delalloc range intact after failing all the writes. Hence failed writes now end up writing zeroes to disk in the cases where invalidation zeroes folios rather than removing them from cache. This is a fundamental change of behaviour that is needed to avoid the data corruption vectors that exist in the old write fail path, and it renders the drop-writes injection non-functional and unworkable as it stands. As it is, I think the error injection is also now unnecessary, as partial writes that need delalloc extent removal are going to be a lot more common with stale iomap detection in place. Hence this patch removes the drop-writes error injection completely. xfs/196 can remain for testing kernels that don't have this data corruption fix, but those that do will report: xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel. Signed-off-by: Dave Chinner Reviewed-by: Darrick J.
Wong --- fs/xfs/libxfs/xfs_errortag.h | 12 +++++------- fs/xfs/xfs_error.c | 27 ++++++++++++++++++++------- fs/xfs/xfs_iomap.c | 9 --------- 3 files changed, 25 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 5362908164b0..580ccbd5aadc 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -40,13 +40,12 @@ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 + /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Drop-writes support removed because write error handling cannot trash + * pre-existing delalloc extents in any useful way anymore. We retain the + * definition so that we can reject it as an invalid value in + * xfs_errortag_valid(). */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 @@ -95,7 +94,6 @@ #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f18..dea3c0649d2f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, - XFS_RANDOM_DROP_WRITES, + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, @@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); @@ -206,7 +205,6 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), @@ -256,6 +254,19 @@ xfs_errortag_del( kmem_free(mp->m_errortag); } +static bool +xfs_errortag_valid( + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return false; + + /* Error out removed injection types */ + if (error_tag == XFS_ERRTAG_DROP_WRITES) + return false; + return true; +} + bool xfs_errortag_test( struct xfs_mount *mp, @@ -277,7 +288,9 @@ xfs_errortag_test( if (!mp->m_errortag) return false; - ASSERT(error_tag < XFS_ERRTAG_MAX); + if (!xfs_errortag_valid(error_tag)) + return false; + randfactor = mp->m_errortag[error_tag]; if (!randfactor || prandom_u32_max(randfactor)) return false; @@ -293,7 +306,7 @@ xfs_errortag_get( struct xfs_mount *mp, unsigned int error_tag) { - 
if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return mp->m_errortag[error_tag]; @@ -305,7 +318,7 @@ xfs_errortag_set( unsigned int error_tag, unsigned int tag_value) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; mp->m_errortag[error_tag] = tag_value; @@ -319,7 +332,7 @@ xfs_errortag_add( { BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return xfs_errortag_set(mp, error_tag, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 26ca3cc1a048..1bdd7afc1010 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1190,15 +1190,6 @@ xfs_buffered_write_iomap_end( struct xfs_mount *mp = XFS_M(inode->i_sb); int error; - /* - * Behave as if the write failed if drop writes is enabled. Set the NEW - * flag to force delalloc cleanup. - */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { - iomap->flags |= IOMAP_F_NEW; - written = 0; - } - error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, length, written, &xfs_buffered_write_delalloc_punch); if (error && !xfs_is_shutdown(mp)) { -- cgit From c2beff99eb03866df6fdbd3a93b08fd27eb8bf5c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:35 -0800 Subject: xfs: add debug knob to slow down writeback for fun Add a new error injection knob so that we can arbitrarily slow down writeback to test for race conditions and aberrant reclaim behavior if the writeback mechanisms are slow to issue writeback. This will enable functional testing for the ifork sequence counters introduced in commit 745b3f76d1c8 ("xfs: maintain a sequence count for inode fork manipulations"). Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_aops.c | 14 ++++++++++++-- fs/xfs/xfs_error.c | 16 ++++++++++++++++ fs/xfs/xfs_error.h | 13 +++++++++++++ fs/xfs/xfs_trace.c | 2 ++ fs/xfs/xfs_trace.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 90 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 580ccbd5aadc..f5f629174eca 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -61,7 +61,8 @@ #define XFS_ERRTAG_LARP 39 #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 -#define XFS_ERRTAG_MAX 42 +#define XFS_ERRTAG_WB_DELAY_MS 42 +#define XFS_ERRTAG_MAX 43 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -107,5 +108,6 @@ #define XFS_RANDOM_LARP 1 #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 +#define XFS_RANDOM_WB_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a22d90af40c8..41734202796f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -17,6 +17,8 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_reflink.h" +#include "xfs_errortag.h" +#include "xfs_error.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -217,11 +219,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. 
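 *
 * As a concrete illustration (the sequence numbers are made up): if this
 * mapping was cached while the data fork was at sequence 5 and a concurrent
 * write has since bumped ip->i_df.if_seq to 6, the cached iomap may no
 * longer describe the blocks under this folio, so it must be thrown away
 * and the blocks remapped.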
*/ - if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) { + trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->data_seq, XFS_DATA_FORK); return false; + } if (xfs_inode_has_cow_data(ip) && - XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) { + trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap, + XFS_WPC(wpc)->cow_seq, XFS_COW_FORK); return false; + } return true; } @@ -285,6 +293,8 @@ xfs_map_blocks( if (xfs_is_shutdown(mp)) return -EIO; + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + /* * COW fork blocks can overlap data fork blocks even if the blocks * aren't shared. COW I/O always takes precedent, so we must always diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dea3c0649d2f..2d6e3c718e03 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -60,6 +60,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, + XFS_RANDOM_WB_DELAY_MS, }; struct xfs_errortag_attr { @@ -175,6 +176,7 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); +XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -218,6 +220,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(larp), XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), + XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); @@ -267,6 +270,19 @@ xfs_errortag_valid( return true; } +bool +xfs_errortag_enabled( + struct xfs_mount *mp, + unsigned int tag) +{ + if (!mp->m_errortag) + return false; + if (!xfs_errortag_valid(tag)) + return false; + + return mp->m_errortag[tag] != 0; +} + bool xfs_errortag_test( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5191e9145e55..dbe6c37dc697 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, const char *file, int line, unsigned int error_tag); #define XFS_TEST_ERROR(expr, mp, tag) \ ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); +#define XFS_ERRORTAG_DELAY(mp, tag) \ + do { \ + might_sleep(); \ + if (!xfs_errortag_enabled((mp), (tag))) \ + break; \ + xfs_warn_ratelimited((mp), \ +"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \ + (mp)->m_errortag[(tag)], __FILE__, __LINE__, \ + (mp)->m_super->s_id); \ + mdelay((mp)->m_errortag[(tag)]); \ + } while (0) extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, @@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) #define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) #define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index d269ef57ff01..8a5dc1538aa8 100644 --- a/fs/xfs/xfs_trace.c +++ 
b/fs/xfs/xfs_trace.c @@ -34,6 +34,8 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_error.h" +#include +#include "xfs_iomap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 372d871bccc5..c9ada9577a4a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3352,6 +3352,50 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ TP_ARGS(ip, irec)) +/* inode iomap invalidation events */ +DECLARE_EVENT_CLASS(xfs_wb_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), + TP_ARGS(ip, iomap, wpcseq, whichfork), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u16, type) + __field(u16, flags) + __field(u32, wpcseq) + __field(u32, forkseq) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->wpcseq = wpcseq; + __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->wpcseq, + __entry->forkseq) +); +#define DEFINE_WB_INVALID_EVENT(name) \ +DEFINE_EVENT(xfs_wb_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \ + TP_ARGS(ip, iomap, wpcseq, whichfork)) +DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); +DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ -- cgit From 254e3459285cbf2174350bbc0051e475e1bc5196 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:36 -0800 Subject: xfs: add debug knob to slow down write for fun Add a new error injection knob so that we can arbitrarily slow down pagecache writes to test for race conditions and aberrant reclaim behavior if the writeback mechanisms are slow to issue writeback. This will enable functional testing for the ifork sequence counters introduced in commit 304a68b9c63b ("xfs: use iomap_valid method to detect stale cached iomaps") that fixes write racing with reclaim writeback. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_error.c | 3 +++ fs/xfs/xfs_iomap.c | 14 ++++++++++++-- fs/xfs/xfs_trace.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index f5f629174eca..01a9e86b3037 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -62,7 +62,8 @@ #define XFS_ERRTAG_DA_LEAF_SPLIT 40 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 #define XFS_ERRTAG_WB_DELAY_MS 42 -#define XFS_ERRTAG_MAX 43 +#define XFS_ERRTAG_WRITE_DELAY_MS 43 +#define XFS_ERRTAG_MAX 44 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
@@ -109,5 +110,6 @@ #define XFS_RANDOM_DA_LEAF_SPLIT 1 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #define XFS_RANDOM_WB_DELAY_MS 3000 +#define XFS_RANDOM_WRITE_DELAY_MS 3000 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2d6e3c718e03..713341d246d1 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, XFS_RANDOM_WB_DELAY_MS, + XFS_RANDOM_WRITE_DELAY_MS, }; struct xfs_errortag_attr { @@ -177,6 +178,7 @@ XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -221,6 +223,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), + XFS_ERRORTAG_ATTR_LIST(write_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1bdd7afc1010..1005f1e36545 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,6 +27,8 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -71,8 +73,16 @@ xfs_iomap_valid( struct inode *inode, const struct iomap *iomap) { - return iomap->validity_cookie == - xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); + struct xfs_inode *ip = XFS_I(inode); + + if (iomap->validity_cookie != + xfs_iomap_inode_sequence(ip, iomap->flags)) { + trace_xfs_iomap_invalid(ip, iomap); + return false; + } + + XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); + return true; } const struct iomap_page_ops xfs_iomap_page_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c9ada9577a4a..421d1e504ac4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3396,6 +3396,48 @@ DEFINE_EVENT(xfs_wb_invalid_class, name, \ DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid); DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid); +DECLARE_EVENT_CLASS(xfs_iomap_invalid_class, + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), + TP_ARGS(ip, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u64, addr) + __field(loff_t, pos) + __field(u64, len) + __field(u64, validity_cookie) + __field(u64, inodeseq) + __field(u16, type) + __field(u16, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->addr = iomap->addr; + __entry->pos = iomap->offset; + __entry->len = iomap->length; + __entry->validity_cookie = iomap->validity_cookie; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags); + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->addr, + __entry->len, + __entry->type, + __entry->flags, + __entry->validity_cookie, + __entry->inodeseq) +); +#define DEFINE_IOMAP_INVALID_EVENT(name) \ 
+DEFINE_EVENT(xfs_iomap_invalid_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \ + TP_ARGS(ip, iomap)) +DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid); + /* refcount/reflink tracepoint definitions */ /* reflink tracepoints */ -- cgit From ca57f02295f188d6c65ec02202402979880fa6d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 28 Nov 2022 22:02:56 +0000 Subject: afs: Fix fileserver probe RTT handling The fileserver probing code attempts to work out the best fileserver to use for a volume by retrieving the RTT calculated by AF_RXRPC for the probe call sent to each server and comparing them. Sometimes, however, no RTT estimate is available and rxrpc_kernel_get_srtt() returns false, leading good fileservers to be given an RTT of UINT_MAX and thus causing the rotation algorithm to ignore them. Fix afs_select_fileserver() to ignore rxrpc_kernel_get_srtt()'s return value and just take the estimated RTT it provides - which will be capped at 1 second. Fixes: 1d4adfaf6574 ("rxrpc: Make rxrpc_kernel_get_srtt() indicate validity") Signed-off-by: David Howells Reviewed-by: Marc Dionne Tested-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/166965503999.3392585.13954054113218099395.stgit@warthog.procyon.org.uk/ Signed-off-by: Linus Torvalds --- fs/afs/fs_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index c0031a3ab42f..3ac5fcf98d0d 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -167,8 +167,8 @@ responded: clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); } - if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && - rtt_us < server->probe.rtt) { + rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); + if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; -- cgit From 98dc08bae6780bb950b5c0cdefeb662b22482655 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Nov 2022 23:04:01 -0800 Subject: fsverity: stop using PG_error to track error status As a step towards freeing the PG_error flag for other uses, change ext4 and f2fs to stop using PG_error to track verity errors. Instead, if a verity error occurs, just mark the whole bio as failed. The coarser granularity isn't really a problem since it isn't any worse than what the block layer provides, and errors from a multi-page readahead aren't reported to applications unless a single-page read fails too. f2fs supports compression, which makes the f2fs changes a bit more complicated than desired, but the basic premise still works. Note: there are still a few uses of PageError in f2fs, but they are on the write path, so they are unrelated and this patch doesn't touch them. Reviewed-by: Chao Yu Acked-by: Jaegeuk Kim Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221129070401.156114-1-ebiggers@kernel.org --- fs/ext4/readpage.c | 8 ++----- fs/f2fs/compress.c | 64 ++++++++++++++++++++++++++---------------------------- fs/f2fs/data.c | 53 +++++++++++++++++++++++++++----------------- fs/verity/verify.c | 12 +++++----- 4 files changed, 72 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3d21eae267fc..e604ea4e102b 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if verity failed. 
*/ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } unlock_page(page); } if (bio->bi_private) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d315c2de136f..2b7a5cc4ed66 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1711,50 +1711,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) } } -/* - * Update and unlock the cluster's pagecache pages, and release the reference to - * the decompress_io_ctx that was being held for I/O completion. - */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task) +static void f2fs_verify_cluster(struct work_struct *work) { + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); int i; + /* Verify, update, and unlock the decompressed pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; - /* PG_error was set if verity failed. */ - if (failed || PageError(rpage)) { - ClearPageUptodate(rpage); - /* will re-read again later */ - ClearPageError(rpage); - } else { + if (fsverity_verify_page(rpage)) SetPageUptodate(rpage); - } + else + ClearPageUptodate(rpage); unlock_page(rpage); } - f2fs_put_dic(dic, in_task); -} - -static void f2fs_verify_cluster(struct work_struct *work) -{ - struct decompress_io_ctx *dic = - container_of(work, struct decompress_io_ctx, verity_work); - int i; - - /* Verify the cluster's decompressed pages with fs-verity. */ - for (i = 0; i < dic->cluster_size; i++) { - struct page *rpage = dic->rpages[i]; - - if (rpage && !fsverity_verify_page(rpage)) - SetPageError(rpage); - } - - __f2fs_decompress_end_io(dic, false, true); + f2fs_put_dic(dic, true); } /* @@ -1764,6 +1741,8 @@ static void f2fs_verify_cluster(struct work_struct *work) void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task) { + int i; + if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done @@ -1773,9 +1752,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); - } else { - __f2fs_decompress_end_io(dic, failed, in_task); + return; + } + + /* Update and unlock the cluster's pagecache pages. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (failed) + ClearPageUptodate(rpage); + else + SetPageUptodate(rpage); + unlock_page(rpage); } + + /* + * Release the reference to the decompress_io_ctx that was being held + * for I/O completion. + */ + f2fs_put_dic(dic, in_task); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a71e818cd67b..7af75041bd81 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -116,43 +116,56 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + /* + * decompression_attempted keeps track of whether + * f2fs_end_read_compressed_page() has been called on the pages in the + * bio that belong to a compressed cluster yet. + */ + bool decompression_attempted; block_t fs_blkaddr; }; +/* + * Update and unlock a bio's pages, and free the bio. + * + * This marks pages up-to-date only if there was no error in the bio (I/O error, + * decryption error, or verity error), as indicated by bio->bi_status. 
+ * + * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) + * aren't marked up-to-date here, as decompression is done on a per-compression- + * cluster basis rather than a per-bio basis. Instead, we only must do two + * things for each compressed page here: call f2fs_end_read_compressed_page() + * with failed=true if an error occurred before it would have normally gotten + * called (i.e., I/O error or decryption error, but *not* verity error), and + * release the bio's reference to the decompress_io_ctx of the page's cluster. + */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; + struct bio_post_read_ctx *ctx = bio->bi_private; - /* - * Update and unlock the bio's pagecache pages, and put the - * decompression context for any compressed pages. - */ bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { - if (bio->bi_status) + if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(page, true, 0, in_task); f2fs_put_page_dic(page, in_task); continue; } - /* PG_error was set if verity failed. */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); + if (ctx) + mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } @@ -185,8 +198,10 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !fsverity_verify_page(page)) - SetPageError(page); + !fsverity_verify_page(page)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } } else { fsverity_verify_bio(bio); @@ -245,6 +260,8 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, blkaddr++; } + ctx->decompression_attempted = true; + /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully @@ -1062,6 +1079,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; + ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); @@ -1089,7 +1107,6 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, bio_put(bio); return -EFAULT; } - ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); @@ -2141,7 +2158,6 @@ submit_and_realloc: inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = block_nr; goto out; out: @@ -2289,7 +2305,6 @@ submit_and_realloc: inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = blkaddr; } @@ -2306,7 +2321,6 @@ out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); - ClearPageError(cc->rpages[i]); unlock_page(cc->rpages[i]); } } @@ -2403,7 +2417,6 @@ read_single_page: #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif - SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); 
unlock_page(page); } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index bde8c9b7d25f..961ba248021f 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -200,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. Pages - * that fail verification are set to the Error state. Verification is skipped - * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * must be pagecache pages that are still locked and not yet uptodate. If a + * page fails verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that @@ -244,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio) unsigned long level0_ra_pages = min(max_ra_pages, params->level0_blocks - level0_index); - if (!PageError(page) && - !verify_page(inode, vi, req, page, level0_ra_pages)) - SetPageError(page); + if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } fsverity_free_hash_request(params->hash_alg, req); -- cgit From b0284cd29a957e62d60c2886fd663be93c56f9c0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Nov 2022 18:10:34 -0700 Subject: mm: Do not enable PG_arch_2 for all 64-bit architectures Commit 4beba9486abd ("mm: Add PG_arch_2 page flag") introduced a new page flag for all 64-bit architectures. However, even if an architecture is 64-bit, it may still have limited spare bits in the 'flags' member of 'struct page'. This may happen if an architecture enables SPARSEMEM without SPARSEMEM_VMEMMAP as is the case with the newly added loongarch. This architecture port needs 19 more bits for the sparsemem section information and, while it is currently fine with PG_arch_2, adding any more PG_arch_* flags will trigger build-time warnings. Add a new CONFIG_ARCH_USES_PG_ARCH_X option which can be selected by architectures that need more PG_arch_* flags beyond PG_arch_1. Select it on arm64. Signed-off-by: Catalin Marinas [pcc@google.com: fix build with CONFIG_ARM64_MTE disabled] Signed-off-by: Peter Collingbourne Reported-by: kernel test robot Cc: Andrew Morton Cc: Steven Price Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-2-pcc@google.com --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index f2273b164535..882525c8e94c 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -219,7 +219,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); #endif -- cgit From ef6458b1b6ca3fdb991ce4182e981a88d4c58c0f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 3 Nov 2022 18:10:37 -0700 Subject: mm: Add PG_arch_3 page flag As with PG_arch_2, this flag is only allowed on 64-bit architectures due to the shortage of bits available. It will be used by the arm64 MTE code in subsequent patches. 
Signed-off-by: Peter Collingbourne Cc: Will Deacon Cc: Marc Zyngier Cc: Steven Price [catalin.marinas@arm.com: added flag preserving in __split_huge_page_tail()] Signed-off-by: Catalin Marinas Reviewed-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221104011041.290951-5-pcc@google.com --- fs/proc/page.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 882525c8e94c..6249c347809a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -221,6 +221,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); #ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); + u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif return u; -- cgit From 318cdc822c63b6e2befcfdc2088378ae6fa18def Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Jun 2022 19:26:47 +0800 Subject: ext4: check and assert if marking a no_delete evicting inode dirty In ext4_evict_inode(), if we are evicting an inode in the 'no_delete' path, it must not be raced by another mark_inode_dirty(). If that happens, someone else may accidentally dirty it without holding the inode refcount and probably cause use-after-free issues in the writeback procedure. This is hard to discover and debug, so add a WARN_ON_ONCE() to check and detect this issue in advance. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220629112647.4141034-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c5f5dabe0fd..121ce2fccfab 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -335,6 +335,12 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + /* + * Check out some where else accidentally dirty the evicting inode, + * which may probably cause inode use-after-free issues later. + */ + WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); + if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ -- cgit From 66267814ba0ee0732c69ca87eb1fd6eb63bf0d5f Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Wed, 17 Aug 2022 10:59:28 +0800 Subject: fs/ext4: replace ternary operator with min()/max() and min_t() Fix the following coccicheck warning: fs/ext4/inline.c:183: WARNING opportunity for min(). fs/ext4/extents.c:2631: WARNING opportunity for max(). fs/ext4/extents.c:2632: WARNING opportunity for min(). fs/ext4/extents.c:5559: WARNING opportunity for max(). fs/ext4/super.c:6908: WARNING opportunity for min(). The min()/max() and min_t() macros are defined in include/linux/minmax.h. They avoid multiple evaluations of the arguments when non-constant and perform strict type-checking.
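To see why the strict type-checking matters, here is a standalone sketch with a simplified stand-in for the kernel's min_t(); the macro is only an approximation of include/linux/minmax.h, for illustration:

    #include <stdio.h>

    /* Simplified stand-in for the kernel's min_t(); illustrative only. */
    #define min_t(type, x, y) ({ type __x = (x); type __y = (y); \
                                 __x < __y ? __x : __y; })

    int main(void)
    {
        int a = -1;
        unsigned int b = 1;

        /* The usual arithmetic conversions turn -1 into UINT_MAX, so an
         * open-coded ternary silently picks 1. */
        printf("ternary: %u\n", a < b ? a : b);

        /* min_t() pins the comparison type explicitly and yields -1; the
         * kernel's plain min()/max() would flag the mixed types at build
         * time instead of converting silently. */
        printf("min_t:   %d\n", min_t(int, a, b));
        return 0;
    }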
Reported-by: kernel test robot Suggested-by: Lukas Czerner Signed-off-by: Jiangshan Yi Reviewed-by: Lukas Czerner Link: https://lore.kernel.org/r/20220817025928.612851-1-13667453960@163.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 8 +++----- fs/ext4/inline.c | 3 +-- fs/ext4/super.c | 3 +-- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6c399a8b22b3..ec008278d970 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2635,9 +2635,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unwritten, ex_ee_len); path[depth].p_ext = ex; - a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; + a = max(ex_ee_block, start); + b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); @@ -5567,8 +5566,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, - len_lblk, SHIFT_RIGHT); + max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4fbe825694b..2b42ececa46d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -180,8 +180,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer, BUG_ON(len > EXT4_I(inode)->i_inline_size); - cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? - len : EXT4_MIN_INLINE_DATA_SIZE; + cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7cdd2138c897..4749b303f2a9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7031,8 +7031,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); -- cgit From a4bbf53d88c728da9ff6c316b1e4ded63a8f3940 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Nov 2022 20:51:39 -0800 Subject: fsverity: simplify fsverity_get_digest() Instead of looking up the algorithm by name in hash_algo_name[] to get its hash_algo ID, just store the hash_algo ID in the fsverity_hash_alg struct. Verify at boot time that every fsverity_hash_alg has a valid hash_algo ID with matching digest size. Remove an unnecessary memset() of the whole digest array to 0 before the digest is copied into it. Finally, remove the pr_debug statement. There is already a pr_debug for the fsverity digest when the file is opened. Signed-off-by: Eric Biggers Reviewed-by: Mimi Zohar Link: https://lore.kernel.org/r/20221129045139.69803-1-ebiggers@kernel.org --- fs/verity/fsverity_private.h | 5 +++++ fs/verity/hash_algs.c | 6 ++++++ fs/verity/measure.c | 19 ++----------------- 3 files changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index dbe1ce5b450a..c7fcb855e068 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -32,6 +32,11 @@ struct fsverity_hash_alg { unsigned int digest_size; /* digest size in bytes, e.g. 
32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ mempool_t req_pool; /* mempool with a preallocated hash request */ + /* + * The HASH_ALGO_* constant for this algorithm. This is different from + * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. + */ + enum hash_algo algo_id; }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 71d0fccb6d4c..6f8170cf4ae7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA256, }, [FS_VERITY_HASH_ALG_SHA512] = { .name = "sha512", .digest_size = SHA512_DIGEST_SIZE, .block_size = SHA512_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA512, }, }; @@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void) */ BUG_ON(!is_power_of_2(alg->digest_size)); BUG_ON(!is_power_of_2(alg->block_size)); + + /* Verify that there is a valid mapping to HASH_ALGO_*. */ + BUG_ON(alg->algo_id == 0); + BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]); } } diff --git a/fs/verity/measure.c b/fs/verity/measure.c index e99c00350c28..5c79ea1b2468 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); * @alg: (out) pointer to the hash algorithm enumeration * * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling fsverity_get_digest(), the file must have been - * opened. + * Assumption: before calling this, the file must have been opened. * * Return: 0 on success, -errno on failure */ @@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode, { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; - int i; vi = fsverity_get_info(inode); if (!vi) return -ENODATA; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE); - - /* convert the verity hash algorithm name to a hash_algo_name enum */ - i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name); - if (i < 0) - return -EINVAL; - *alg = i; - - if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg])) - return -EINVAL; memcpy(digest, vi->file_digest, hash_alg->digest_size); - - pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg], - hash_digest_size[*alg], digest); - + *alg = hash_alg->algo_id; return 0; } -- cgit From ab1ddef98a715eddb65309ffa83267e4e84a571e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 14 Nov 2022 08:33:09 -0500 Subject: filelock: new helper: vfs_inode_has_locks Ceph has a need to know whether a particular inode has any locks set on it. It's currently tracking that by a num_locks field in its filp->private_data, but that's problematic as it tries to decrement this field when releasing locks and that can race with the file being torn down. Add a new vfs_inode_has_locks helper that just returns whether any locks are currently held on the inode. 
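A sketch of the kind of caller this enables — a hypothetical filesystem release path that no longer needs a private lock counter (the function below is illustrative and not part of the ceph conversion):

    /* Hypothetical kernel-style caller; names are illustrative. */
    static void example_fs_release(struct inode *inode)
    {
        /*
         * No private num_locks counter to maintain: the VFS answers
         * from inode->i_flctx under ctx->flc_lock, so this cannot race
         * with file teardown the way a manually kept counter could.
         */
        if (vfs_inode_has_locks(inode))
            pr_debug("inode %lu still has POSIX/flock locks\n",
                     inode->i_ino);
    }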
Reviewed-by: Xiubo Li Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/locks.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 5876c8ff0edc..9ccf89b6c95d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2672,6 +2672,29 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) } EXPORT_SYMBOL_GPL(vfs_cancel_lock); +/** + * vfs_inode_has_locks - are any file locks held on @inode? + * @inode: inode to check for locks + * + * Return true if there are any FL_POSIX or FL_FLOCK locks currently + * set on @inode. + */ +bool vfs_inode_has_locks(struct inode *inode) +{ + struct file_lock_context *ctx; + bool ret; + + ctx = smp_load_acquire(&inode->i_flctx); + if (!ctx) + return false; + + spin_lock(&ctx->flc_lock); + ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock); + spin_unlock(&ctx->flc_lock); + return ret; +} +EXPORT_SYMBOL_GPL(vfs_inode_has_locks); + #ifdef CONFIG_PROC_FS #include #include -- cgit From 401a8b8fd5acd51582b15238d72a8d0edd580e9f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:02:30 -0500 Subject: filelock: add a new locks_inode_context accessor function There are a number of places in the kernel that are accessing the inode->i_flctx field without smp_load_acquire. This is required to ensure that the caller doesn't see a partially-initialized structure. Add a new accessor function for it to make this clear and convert all of the relevant accesses in locks.c to use it. Also, convert locks_free_lock_context to use the helper as well instead of just doing a "bare" assignment. Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/locks.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 9ccf89b6c95d..8e48b3d5afe3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type) struct file_lock_context *ctx; /* paired with cmpxchg() below */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type) */ if (cmpxchg(&inode->i_flctx, NULL, ctx)) { kmem_cache_free(flctx_cache, ctx); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); } out: trace_locks_get_lock_context(inode, type, ctx); @@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, void locks_free_lock_context(struct inode *inode) { - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); if (unlikely(ctx)) { locks_check_ctx_lists(inode); @@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) void *owner; void (*func)(void); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; return; @@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl->fl_flags = type; /* typically we will check that ctx is non-NULL before calling */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { WARN_ON_ONCE(1); goto free_lock; @@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) struct file_lock_context *ctx; struct file_lock *fl; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = 
locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); fl = list_first_entry_or_null(&ctx->flc_lease, @@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp) int type = F_UNLCK; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -1823,7 +1823,7 @@ static int generic_delete_lease(struct file *filp, void *owner) struct file_lock_context *ctx; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; @@ -2563,7 +2563,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2636,7 +2636,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); + ctx = locks_inode_context(locks_inode(filp)); if (!ctx) return; @@ -2684,7 +2684,7 @@ bool vfs_inode_has_locks(struct inode *inode) struct file_lock_context *ctx; bool ret; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return false; @@ -2865,7 +2865,7 @@ void show_fd_locks(struct seq_file *f, struct file_lock_context *ctx; int id = 0; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return; -- cgit From d4e78663f6bc83db44041f224e58e0940662a912 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 08:44:25 -0500 Subject: ceph: use locks_inode_context helper ceph currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Reviewed-by: Xiubo Li Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/ceph/locks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 3e2843e86e27..f3b461c708a8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -364,7 +364,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *fcntl_count = 0; *flock_count = 0; - ctx = inode->i_flctx; + ctx = locks_inode_context(inode); if (ctx) { spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_posix, fl_list) @@ -418,7 +418,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); int err = 0; int seen_fcntl = 0; int seen_flock = 0; -- cgit From a1fde8ee771f92d2a8bbd79532149ddd34546bc1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 08:49:25 -0500 Subject: cifs: use locks_inode_context helper cifs currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). 
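The memory-ordering contract behind this whole run of conversions, spelled out once: locks_get_lock_context() publishes the context with cmpxchg(), a release operation, so every reader must pair it with an acquire load. A condensed sketch of that pairing; the helper body is assumed to match the accessor introduced earlier, see include/linux/fs.h for the real definition:

    /*
     * Writer (locks_get_lock_context(), abridged): the cmpxchg() release
     * orders all stores that initialized *ctx before the pointer becomes
     * visible to other CPUs:
     *
     *     if (cmpxchg(&inode->i_flctx, NULL, ctx)) { ...lost the race... }
     *
     * Reader: the acquire below pairs with that release, so a non-NULL
     * return is always a fully initialized context.
     */
    static inline struct file_lock_context *
    locks_inode_context(const struct inode *inode)
    {
        return smp_load_acquire(&inode->i_flctx);
    }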
Cc: Steve French Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd9698209930..6c1431979495 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1413,7 +1413,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) struct inode *inode = d_inode(cfile->dentry); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; -- cgit From 87f00aba211ef7308fd6c5d47d646a70bf196662 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 08:50:52 -0500 Subject: ksmbd: use locks_inode_context helper ksmbd currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Cc: Steve French Acked-by: Namjae Jeon Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/ksmbd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 8de970d6146f..f9e85d6a160e 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -321,7 +321,7 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end, unsigned char type) { struct file_lock *flock; - struct file_lock_context *ctx = file_inode(filp)->i_flctx; + struct file_lock_context *ctx = locks_inode_context(file_inode(filp)); int error = 0; if (!ctx || list_empty_careful(&ctx->flc_posix)) -- cgit From 98b41ffe0afdfeaa1439a5d6bd2db4a94277e31b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:19:43 -0500 Subject: lockd: use locks_inode_context helper lockd currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/lockd/svcsubs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e1c4617de771..720684345817 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -207,7 +207,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct nlm_host *lockhost; if (!flctx || list_empty_careful(&flctx->flc_posix)) @@ -262,7 +262,7 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; -- cgit From 17b985def2a859d66d27afee442147468a6a4ea6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:55:36 -0500 Subject: nfs: use locks_inode_context helper nfs currently doesn't access i_flctx safely. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). 
Cc: Trond Myklebust Cc: Anna Schumaker Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/nfs/delegation.c | 2 +- fs/nfs/nfs4state.c | 2 +- fs/nfs/pagelist.c | 2 +- fs/nfs/write.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ead8a0e06abf..cf7365581031 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -146,7 +146,7 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state { struct inode *inode = state->inode; struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; int status = 0; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a2d2d5d1b088..dd18344648f3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1501,7 +1501,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct file_lock *fl; struct nfs4_lock_state *lsp; int status = 0; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; if (flctx == NULL) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 317cedfa52bf..16be6dae524f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1055,7 +1055,7 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) return 0; - flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx; + flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry)); if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f41d24b54fd1..80c240e50952 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1185,7 +1185,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; - struct file_lock_context *flctx = file_inode(file)->i_flctx; + struct file_lock_context *flctx = locks_inode_context(file_inode(file)); struct nfs_page *req; int do_flush, status; /* @@ -1321,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode, unsigned int pagelen) { int ret; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; if (file->f_flags & O_DSYNC) -- cgit From 77c67530e1f95ac25c7075635f32f04367380894 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 16 Nov 2022 09:36:07 -0500 Subject: nfsd: use locks_inode_context helper nfsd currently doesn't access i_flctx safely everywhere. This requires a smp_load_acquire, as the pointer is set via cmpxchg (a release operation). 
Acked-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton --- fs/nfsd/nfs4state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 836bd825ca4a..da8d0ea66229 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4758,7 +4758,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) static bool nfsd4_deleg_present(const struct inode *inode) { - struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); + struct file_lock_context *ctx = locks_inode_context(inode); return ctx && !list_empty_careful(&ctx->flc_lease); } @@ -5897,7 +5897,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { nf = stp->st_stid.sc_file; - ctx = nf->fi_inode->i_flctx; + ctx = locks_inode_context(nf->fi_inode); if (!ctx) continue; if (locks_owner_has_blockers(ctx, lo)) @@ -7713,7 +7713,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = locks_inode(nf->nf_file); - flctx = inode->i_flctx; + flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); -- cgit From f2f2494c8aa3cc317572c4674ef256005ebc092b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Nov 2022 15:43:57 -0800 Subject: Add process name and pid to locks warning It's fairly useless to complain about using an obsolete feature without telling the user which process used it. My Fedora desktop randomly drops this message, but I would really need this patch to figure out what triggers it. [ jlayton: print pid as well as process name ] Signed-off-by: Andi Kleen Signed-off-by: Jeff Layton --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 8e48b3d5afe3..8f01bee17715 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2096,7 +2096,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) * throw a warning to let people know that they don't actually work. */ if (cmd & LOCK_MAND) { - pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); + pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid); return 0; } -- cgit From a79168a0c00d710420c1758f6c38df89e12f0763 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 30 Nov 2022 08:41:01 -0800 Subject: fs/remap_range: avoid spurious writeback on zero length request generic_remap_checks() can reduce the effective request length (i.e., after the reflink extend to EOF case is handled) down to zero. If this occurs, __generic_remap_file_range_prep() proceeds through dio serialization, file mapping flush calls, and may invoke file_modified() before returning to the filesystem caller, all of which immediately check for len == 0 and return. While this is mostly harmless, it is spurious and not completely without side effect. A filemap write call can submit I/O (but not wait on it) when the specified end byte precedes the start but happens to land on the same aligned page boundary, which can occur from __generic_remap_file_range_prep() when len is 0. The dedupe path already has a len == 0 check to break out before doing range comparisons. Lift this check a bit earlier in the function to cover the general case of len == 0 and avoid the unnecessary work; a worked example of the page-boundary side effect is sketched below.
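A worked sketch of that page-boundary side effect (plain userspace arithmetic; PAGE_SHIFT and the comparisons only approximate the kernel's range handling):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        /* len == 0 makes the inclusive end byte precede the start... */
        uint64_t pos = 4100, len = 0;
        uint64_t start = pos, end = pos + len - 1;    /* 4099 */

        /* ...yet both bytes sit in page 1, so the write side, which
         * thinks in page indexes, still queues writeback for the page. */
        printf("write side: page %llu..%llu\n",
               (unsigned long long)(start >> PAGE_SHIFT),
               (unsigned long long)(end >> PAGE_SHIFT));

        /* The wait side compares bytes and returns early, so the I/O is
         * submitted but never waited on. */
        if (end < start)
            printf("wait side: end_byte < start_byte, skipped\n");
        return 0;
    }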
While here, account for the case where generic_remap_check_len() may also reduce length to zero. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/remap_range.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/remap_range.c b/fs/remap_range.c index 654912d06862..8741ecd616a7 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -304,7 +304,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret) + if (ret || *len == 0) return ret; /* Wait for the completion of any pending IOs on both files */ @@ -328,9 +328,6 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - if (*len == 0) - return 0; - if (!IS_DAX(inode_in)) ret = vfs_dedupe_file_range_compare(file_in, pos_in, file_out, pos_out, *len, &is_same); @@ -348,7 +345,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, remap_flags); - if (ret) + if (ret || *len == 0) return ret; /* If can't alter the file contents, we're done. */ -- cgit From 032e160305f6872e590c77f11896fb28365c6d6c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: xfs: invalidate block device page cache during unmount Every now and then I see fstests failures on aarch64 (64k pages) that trigger on the following sequence: mkfs.xfs $dev mount $dev $mnt touch $mnt/a umount $mnt xfs_db -c 'path /a' -c 'print' $dev 99% of the time this succeeds, but every now and then xfs_db cannot find /a and fails. This turns out to be a race involving udev/blkid, the page cache for the block device, and the xfs_db process. udev is triggered whenever anyone closes a block device or unmounts it. The default udev rules invoke blkid to read the fs super and create symlinks to the bdev under /dev/disk. For this, it uses buffered reads through the page cache. xfs_db also uses buffered reads to examine metadata. There is no coordination between xfs_db and udev, which means that they can run concurrently. Note there is no coordination between the kernel and blkid either. On a system with 64k pages, the page cache can cache the superblock and the root inode (and hence the root dir) with the same 64k page. If udev spawns blkid after the mkfs and the system is busy enough that it is still running when xfs_db starts up, they'll both read from the same page in the pagecache. The unmount writes updated inode metadata to disk directly. The XFS buffer cache does not use the bdev pagecache, nor does it invalidate the pagecache on umount. If the above scenario occurs, the pagecache no longer reflects what's on disk, xfs_db reads the stale metadata, and fails to find /a. Most of the time this succeeds because closing a bdev invalidates the page cache, but when processes race, everyone loses. Fix the problem by invalidating the bdev pagecache after flushing the bdev, so that xfs_db will see up to date metadata. Signed-off-by: Darrick J. 
Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_buf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index dde346450952..54c774af6e1c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1945,6 +1945,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); + invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); -- cgit From fd5beaff250d7e88912a937fad072d9d24f219da Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: xfs: use memcpy, not strncpy, to format the attr prefix during listxattr When -Wstringop-truncation is enabled, the compiler complains about truncation of the null byte at the end of the xattr name prefix. This is intentional, since we're concatenating the two strings together and do _not_ want a null byte in the middle of the name. We've already ensured that the name buffer is long enough to handle prefix and name, and the prefix_len is supposed to be the length of the prefix string without the null byte, so use memcpy here instead. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index c325a28b89a8..10aa1fd39d2b 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -210,7 +210,7 @@ __xfs_xattr_put_listent( return; } offset = context->buffer + context->count; - strncpy(offset, prefix, prefix_len); + memcpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; -- cgit From e5827a007aa4bb737c63121fd2c77e089b18a372 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:42 -0800 Subject: xfs: shut up -Wuninitialized in xfsaild_push -Wuninitialized complains about @target in xfsaild_push being uninitialized in the case where the waitqueue is active but there is no last item in the AIL to wait for. I /think/ it should never be the case that the subsequent xfs_trans_ail_cursor_first returns a log item and hence we'll never end up at XFS_LSN_CMP, but let's make this explicit. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang Reviewed-by: Dave Chinner --- fs/xfs/xfs_trans_ail.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index f51df7d94ef7..7d4109af193e 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -422,7 +422,7 @@ xfsaild_push( struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; - xfs_lsn_t target; + xfs_lsn_t target = NULLCOMMITLSN; long tout; int stuck = 0; int flushing = 0; @@ -472,6 +472,8 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail); + ASSERT(target != NULLCOMMITLSN); + lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; -- cgit From 4c6dbfd2756bd83a0085ed804e2bb7be9cc16bc5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2022 17:24:43 -0800 Subject: xfs: attach dquots to inode before reading data/cow fork mappings I've been running near-continuous integration testing of online fsck, and I've noticed that once a day, one of the ARM VMs will fail the test with out of order records in the data fork. xfs/804 races fsstress with online scrub (aka scan but do not change anything), so I think this might be a bug in the core xfs code. 
This also only seems to trigger if one runs the test for more than ~6 minutes via TIME_FACTOR=13 or something. https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/tree/tests/xfs/804?h=djwong-wtf I added a debugging patch to the kernel to check the data fork extents after taking the ILOCK, before dropping ILOCK, and before and after each bmapping operation. So far I've narrowed it down to the delalloc code inserting a record in the wrong place in the iext tree: xfs_bmap_add_extent_hole_delay, near line 2691: case 0: /* * New allocation is not contiguous with another * delayed allocation. * Insert a new entry. */ oldlen = newlen = 0; xfs_iunlock_check_datafork(ip); <-- ok here xfs_iext_insert(ip, icur, new, state); xfs_iunlock_check_datafork(ip); <-- bad here break; } I recorded the state of the data fork mappings and iext cursor state when a corrupt data fork is detected immediately after the xfs_bmap_add_extent_hole_delay call in xfs_bmapi_reserve_delalloc: ino 0x140bb3 func xfs_bmapi_reserve_delalloc line 4164 data fork: ino 0x140bb3 nr 0x0 nr_real 0x0 offset 0xb9 blockcount 0x1f startblock 0x935de2 state 1 ino 0x140bb3 nr 0x1 nr_real 0x1 offset 0xe6 blockcount 0xa startblock 0xffffffffe0007 state 0 ino 0x140bb3 nr 0x2 nr_real 0x1 offset 0xd8 blockcount 0xe startblock 0x935e01 state 0 Here we see that a delalloc extent was inserted into the wrong position in the iext leaf, same as all the other times. The extra trace data I collected are as follows: ino 0x140bb3 fork 0 oldoff 0xe6 oldlen 0x4 oldprealloc 0x6 isize 0xe6000 ino 0x140bb3 oldgotoff 0xea oldgotstart 0xfffffffffffffffe oldgotcount 0x0 oldgotstate 0 ino 0x140bb3 crapgotoff 0x0 crapgotstart 0x0 crapgotcount 0x0 crapgotstate 0 ino 0x140bb3 freshgotoff 0xd8 freshgotstart 0x935e01 freshgotcount 0xe freshgotstate 0 ino 0x140bb3 nowgotoff 0xe6 nowgotstart 0xffffffffe0007 nowgotcount 0xa nowgotstate 0 ino 0x140bb3 oldicurpos 1 oldleafnr 2 oldleaf 0xfffffc00f0609a00 ino 0x140bb3 crapicurpos 2 crapleafnr 2 crapleaf 0xfffffc00f0609a00 ino 0x140bb3 freshicurpos 1 freshleafnr 2 freshleaf 0xfffffc00f0609a00 ino 0x140bb3 newicurpos 1 newleafnr 3 newleaf 0xfffffc00f0609a00 The first line shows that xfs_bmapi_reserve_delalloc was called with whichfork=XFS_DATA_FORK, off=0xe6, len=0x4, prealloc=6. The second line ("oldgot") shows the contents of @got at the beginning of the call, which are the results of the first iext lookup in xfs_buffered_write_iomap_begin. Line 3 ("crapgot") is the result of duplicating the cursor at the start of the body of xfs_bmapi_reserve_delalloc and performing a fresh lookup at @off. Line 4 ("freshgot") is the result of a new xfs_iext_get_extent right before the call to xfs_bmap_add_extent_hole_delay. Totally garbage. Line 5 ("nowgot") is contents of @got after the xfs_bmap_add_extent_hole_delay call. Line 6 is the contents of @icur at the beginning fo the call. Lines 7-9 are the contents of the iext cursors at the point where the block mappings were sampled. I think @oldgot is a HOLESTARTBLOCK extent because the first lookup didn't find anything, so we filled in imap with "fake hole until the end". At the time of the first lookup, I suspect that there's only one 32-block unwritten extent in the mapping (hence oldicurpos==1) but by the time we get to recording crapgot, crapicurpos==2. Dave then added: Ok, that's much simpler to reason about, and implies the smoke is coming from xfs_buffered_write_iomap_begin() or xfs_bmapi_reserve_delalloc(). 
I suspect the former - it does a lot of stuff with the ILOCK_EXCL held..... .... including calling xfs_qm_dqattach_locked(). xfs_buffered_write_iomap_begin ILOCK_EXCL look up icur xfs_qm_dqattach_locked xfs_qm_dqattach_one xfs_qm_dqget_inode dquot cache miss xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); xfs_ilock(ip, XFS_ILOCK_EXCL); .... xfs_bmapi_reserve_delalloc(icur) Yup, that's what is letting the magic smoke out - xfs_qm_dqattach_locked() can cycle the ILOCK. If that happens, we can pass a stale icur to xfs_bmapi_reserve_delalloc() and it all goes downhill from there. Back to Darrick now: So. Fix this by moving the dqattach_locked call up before we take the ILOCK, like all the other callers in that file. Fixes: a526c85c2236 ("xfs: move xfs_file_iomap_begin_delay around") # goes further back than this Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1005f1e36545..68436370927d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -978,6 +978,10 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); + error = xfs_qm_dqattach(ip); + if (error) + return error; + error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; @@ -1081,10 +1085,6 @@ xfs_buffered_write_iomap_begin( allocfork = XFS_COW_FORK; } - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - if (eof && offset + count > XFS_ISIZE(ip)) { /* * Determine the initial size of the preallocation. -- cgit From 1eb52a6a71981b80f9acbd915acd6a05a5037196 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Wed, 30 Nov 2022 09:25:46 -0800 Subject: xfs: wait iclog complete before tearing down AIL Fix uaf in xfs_trans_ail_delete during xlog force shutdown. In commit cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks") changed the order of running callbacks and wait for iclog completion to avoid unmount path untimely destroy AIL. But which seems not enough to ensue this, adding mdelay in `xfs_buf_item_unpin` can prove that. The reproduction is as follows. To ensure destroy AIL safely, we should wait all xlog ioend workers done and sync the AIL. 
================================================================== BUG: KASAN: use-after-free in xfs_trans_ail_delete+0x240/0x2a0 Read of size 8 at addr ffff888023169400 by task kworker/1:1H/43 CPU: 1 PID: 43 Comm: kworker/1:1H Tainted: G W 6.1.0-rc1-00002-gc28266863c4a #137 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: xfs-log/sda xlog_ioend_work Call Trace: dump_stack_lvl+0x4d/0x66 print_report+0x171/0x4a6 kasan_report+0xb3/0x130 xfs_trans_ail_delete+0x240/0x2a0 xfs_buf_item_done+0x7b/0xa0 xfs_buf_ioend+0x1e9/0x11f0 xfs_buf_item_unpin+0x4c8/0x860 xfs_trans_committed_bulk+0x4c2/0x7c0 xlog_cil_committed+0xab6/0xfb0 xlog_cil_process_committed+0x117/0x1e0 xlog_state_shutdown_callbacks+0x208/0x440 xlog_force_shutdown+0x1b3/0x3a0 xlog_ioend_work+0xef/0x1d0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 Allocated by task 9606: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x7a/0x90 __kmalloc+0x59/0x140 kmem_alloc+0xb2/0x2f0 xfs_trans_ail_init+0x20/0x320 xfs_log_mount+0x37e/0x690 xfs_mountfs+0xe36/0x1b40 xfs_fs_fill_super+0xc5c/0x1a70 get_tree_bdev+0x3c5/0x6c0 vfs_get_tree+0x85/0x250 path_mount+0xec3/0x1830 do_mount+0xef/0x110 __x64_sys_mount+0x150/0x1f0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 9662: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 __kasan_slab_free+0x105/0x1a0 __kmem_cache_free+0x99/0x2d0 kvfree+0x3a/0x40 xfs_log_unmount+0x60/0xf0 xfs_unmountfs+0xf3/0x1d0 xfs_fs_put_super+0x78/0x300 generic_shutdown_super+0x151/0x400 kill_block_super+0x9a/0xe0 deactivate_locked_super+0x82/0xe0 deactivate_super+0x91/0xb0 cleanup_mnt+0x32a/0x4a0 task_work_run+0x15f/0x240 exit_to_user_mode_prepare+0x188/0x190 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd The buggy address belongs to the object at ffff888023169400 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 0 bytes inside of 128-byte region [ffff888023169400, ffff888023169480) The buggy address belongs to the physical page: page:ffffea00008c5a00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888023168f80 pfn:0x23168 head:ffffea00008c5a00 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea00006b3988 ffffea0000577a88 ffff88800f842ac0 raw: ffff888023168f80 0000000000150007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888023169300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888023169380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888023169400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888023169480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888023169500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Disabling lock debugging due to kernel taint Fixes: cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks") Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0141d9907d31..fc61cc024023 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -888,6 +888,23 @@ xlog_force_iclog( return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); } +/* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ +static void +xlog_wait_iclog_completion(struct xlog *log) +{ + int i; + struct xlog_in_core *iclog = log->l_iclog; + + for (i = 0; i < log->l_iclog_bufs; i++) { + down(&iclog->ic_sema); + up(&iclog->ic_sema); + iclog = iclog->ic_next; + } +} + /* * Wait for the iclog and all prior iclogs to be written disk as required by the * log force state machine. Waiting on ic_force_wait ensures iclog completions @@ -1113,6 +1130,14 @@ xfs_log_unmount( { xfs_log_clean(mp); + /* + * If shutdown has come from iclog IO context, the log + * cleaning will have been skipped and so we need to wait + * for the iclog to complete shutdown processing before we + * tear anything down. + */ + xlog_wait_iclog_completion(mp->m_log); + xfs_buftarg_drain(mp->m_ddev_targp); xfs_trans_ail_destroy(mp); @@ -2115,17 +2140,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - /* - * Cycle all the iclogbuf locks to make sure all log IO completion - * is done before we tear down these buffers. - */ - iclog = log->l_iclog; - for (i = 0; i < log->l_iclog_bufs; i++) { - down(&iclog->ic_sema); - up(&iclog->ic_sema); - iclog = iclog->ic_next; - } - /* * Destroy the CIL after waiting for iclog IO completion because an * iclog EIO error will try to shut down the log, which accesses the -- cgit From 575689fc0ffa6c4bb4e72fd18e31a6525a6124e0 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Wed, 30 Nov 2022 09:25:46 -0800 Subject: xfs: fix super block buf log item UAF during force shutdown An xfs log I/O error triggers an xlog shutdown, and the end_io worker calls xlog_state_shutdown_callbacks() to unpin and release the buf log items. The race condition is that some threads doing transaction commits may happen not to be intercepted by xlog_is_shutdown(); their log items are then inserted into the CIL, and when those buf log items are unpinned and released, a UAF occurs. Adding a delay before `xlog_cil_commit` increases the recurrence probability. The following call graph actually encountered this bad situation. fsstress io end worker kworker/0:1H-216 xlog_ioend_work ->xlog_force_shutdown ->xlog_state_shutdown_callbacks ->xlog_cil_process_committed ->xlog_cil_committed ->xfs_trans_committed_bulk ->xfs_trans_apply_sb_deltas ->li_ops->iop_unpin(lip, 1); ->xfs_trans_getsb ->_xfs_trans_bjoin ->xfs_buf_item_init ->if (bip) { return 0;} //relog ->xlog_cil_commit ->xlog_cil_insert_items //insert into CIL ->xfs_buf_ioend_fail(bp); ->xfs_buf_ioend ->xfs_buf_item_done ->xfs_buf_item_relse ->xfs_buf_item_free When the CIL push worker gathers the per-cpu CIL and inserts the super block buf log item into ctx->log_items, the UAF occurs.
================================================================== BUG: KASAN: use-after-free in xlog_cil_push_work+0x1c8f/0x22f0 Write of size 8 at addr ffff88801800f3f0 by task kworker/u4:4/105 CPU: 0 PID: 105 Comm: kworker/u4:4 Tainted: G W 6.1.0-rc1-00001-g274115149b42 #136 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: xfs-cil/sda xlog_cil_push_work Call Trace: dump_stack_lvl+0x4d/0x66 print_report+0x171/0x4a6 kasan_report+0xb3/0x130 xlog_cil_push_work+0x1c8f/0x22f0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 Allocated by task 2145: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_slab_alloc+0x54/0x60 kmem_cache_alloc+0x14a/0x510 xfs_buf_item_init+0x160/0x6d0 _xfs_trans_bjoin+0x7f/0x2e0 xfs_trans_getsb+0xb6/0x3f0 xfs_trans_apply_sb_deltas+0x1f/0x8c0 __xfs_trans_commit+0xa25/0xe10 xfs_symlink+0xe23/0x1660 xfs_vn_symlink+0x157/0x280 vfs_symlink+0x491/0x790 do_symlinkat+0x128/0x220 __x64_sys_symlink+0x7a/0x90 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 216: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 __kasan_slab_free+0x105/0x1a0 kmem_cache_free+0xb6/0x460 xfs_buf_ioend+0x1e9/0x11f0 xfs_buf_item_unpin+0x3d6/0x840 xfs_trans_committed_bulk+0x4c2/0x7c0 xlog_cil_committed+0xab6/0xfb0 xlog_cil_process_committed+0x117/0x1e0 xlog_state_shutdown_callbacks+0x208/0x440 xlog_force_shutdown+0x1b3/0x3a0 xlog_ioend_work+0xef/0x1d0 process_one_work+0x6f9/0xf70 worker_thread+0x578/0xf30 kthread+0x28c/0x330 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff88801800f388 which belongs to the cache xfs_buf_item of size 272 The buggy address is located 104 bytes inside of 272-byte region [ffff88801800f388, ffff88801800f498) The buggy address belongs to the physical page: page:ffffea0000600380 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88801800f208 pfn:0x1800e head:ffffea0000600380 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea0000699788 ffff88801319db50 ffff88800fb50640 raw: ffff88801800f208 000000000015000a 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88801800f280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801800f300: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88801800f380: fc fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88801800f400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801800f480: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Disabling lock debugging due to kernel taint Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_buf_item.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 522d450a94b1..df7322ed73fa 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1018,6 +1018,8 @@ xfs_buf_item_relse( trace_xfs_buf_item_relse(bp, _RET_IP_); ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + if (atomic_read(&bip->bli_refcount)) + return; bp->b_log_item = NULL; xfs_buf_rele(bp); xfs_buf_item_free(bip); -- cgit From ef4d3ea40565a781c25847e9cb96c1bd9f462bc6 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 30 Nov 2022 17:55:51 +0000 Subject: afs: Fix server->active leak in afs_put_server The atomic_read was accidentally replaced with atomic_inc_return, which prevents the server from getting cleaned up and causes rmmod to hang with a warning: Can't purge s=00000001 Fixes: 2757a4dc1849 ("afs: Fix access after dec in put functions") Signed-off-by: Marc Dionne Signed-off-by: David Howells Link: https://lore.kernel.org/r/20221130174053.2665818-1-marc.dionne@auristor.com/ Signed-off-by: Linus Torvalds --- fs/afs/server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/server.c b/fs/afs/server.c index 4981baf97835..b5237206eac3 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -406,7 +406,7 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, if (!server) return; - a = atomic_inc_return(&server->active); + a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); if (unlikely(zero)) -- cgit From f0a0ccda18d6fd826d7c7e7ad48a6ed61c20f8b4 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 19 Nov 2022 21:05:42 +0900 Subject: nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry() Syzbot reported a null-ptr-deref bug: NILFS (loop0): segctord starting. 
Construction interval = 5 seconds, CP frequency < 30 seconds general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 1 PID: 3603 Comm: segctord Not tainted 6.1.0-rc2-syzkaller-00105-gb229b6ca5abb #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 10/11/2022 RIP: 0010:nilfs_palloc_commit_free_entry+0xe5/0x6b0 fs/nilfs2/alloc.c:608 Code: 00 00 00 00 fc ff df 80 3c 02 00 0f 85 cd 05 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b 73 08 49 8d 7e 10 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 26 05 00 00 49 8b 46 10 be a6 00 00 00 48 c7 c7 RSP: 0018:ffffc90003dff830 EFLAGS: 00010212 RAX: dffffc0000000000 RBX: ffff88802594e218 RCX: 000000000000000d RDX: 0000000000000002 RSI: 0000000000002000 RDI: 0000000000000010 RBP: ffff888071880222 R08: 0000000000000005 R09: 000000000000003f R10: 000000000000000d R11: 0000000000000000 R12: ffff888071880158 R13: ffff88802594e220 R14: 0000000000000000 R15: 0000000000000004 FS: 0000000000000000(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb1c08316a8 CR3: 0000000018560000 CR4: 0000000000350ee0 Call Trace: nilfs_dat_commit_free fs/nilfs2/dat.c:114 [inline] nilfs_dat_commit_end+0x464/0x5f0 fs/nilfs2/dat.c:193 nilfs_dat_commit_update+0x26/0x40 fs/nilfs2/dat.c:236 nilfs_btree_commit_update_v+0x87/0x4a0 fs/nilfs2/btree.c:1940 nilfs_btree_commit_propagate_v fs/nilfs2/btree.c:2016 [inline] nilfs_btree_propagate_v fs/nilfs2/btree.c:2046 [inline] nilfs_btree_propagate+0xa00/0xd60 fs/nilfs2/btree.c:2088 nilfs_bmap_propagate+0x73/0x170 fs/nilfs2/bmap.c:337 nilfs_collect_file_data+0x45/0xd0 fs/nilfs2/segment.c:568 nilfs_segctor_apply_buffers+0x14a/0x470 fs/nilfs2/segment.c:1018 nilfs_segctor_scan_file+0x3f4/0x6f0 fs/nilfs2/segment.c:1067 nilfs_segctor_collect_blocks fs/nilfs2/segment.c:1197 [inline] nilfs_segctor_collect fs/nilfs2/segment.c:1503 [inline] nilfs_segctor_do_construct+0x12fc/0x6af0 fs/nilfs2/segment.c:2045 nilfs_segctor_construct+0x8e3/0xb30 fs/nilfs2/segment.c:2379 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2487 [inline] nilfs_segctor_thread+0x3c3/0xf30 fs/nilfs2/segment.c:2570 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 ... If DAT metadata file is corrupted on disk, there is a case where req->pr_desc_bh is NULL and blocknr is 0 at nilfs_dat_commit_end() during a b-tree operation that cascadingly updates ancestor nodes of the b-tree, because nilfs_dat_commit_alloc() for a lower level block can initialize the blocknr on the same DAT entry between nilfs_dat_prepare_end() and nilfs_dat_commit_end(). If this happens, nilfs_dat_commit_end() calls nilfs_dat_commit_free() without valid buffer heads in req->pr_desc_bh and req->pr_bitmap_bh, and causes the NULL pointer dereference above in nilfs_palloc_commit_free_entry() function, which leads to a crash. Fix this by adding a NULL check on req->pr_desc_bh and req->pr_bitmap_bh before nilfs_palloc_commit_free_entry() in nilfs_dat_commit_free(). This also calls nilfs_error() in that case to notify that there is a fatal flaw in the filesystem metadata and prevent further operations. 
Link: https://lkml.kernel.org/r/00000000000097c20205ebaea3d6@google.com Link: https://lkml.kernel.org/r/20221114040441.1649940-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20221119120542.17204-1-konishi.ryusuke@gmail.com Signed-off-by: ZhangPeng Signed-off-by: Ryusuke Konishi Reported-by: syzbot+ebe05ee8e98f755f61d0@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dat.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 3b55e239705f..9930fa901039 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -111,6 +111,13 @@ static void nilfs_dat_commit_free(struct inode *dat, kunmap_atomic(kaddr); nilfs_dat_commit_entry(dat, req); + + if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) { + nilfs_error(dat->i_sb, + "state inconsistency probably due to duplicate use of vblocknr = %llu", + (unsigned long long)req->pr_entry_nr); + return; + } nilfs_palloc_commit_free_entry(dat, req); } -- cgit From d09e8ca6cb93bb4b97517a18fbbf7eccb0e9ff43 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 15 Nov 2022 02:06:01 +0000 Subject: mm: anonymous shared memory naming Since commit 9a10064f5625 ("mm: add a field to store names for private anonymous memory"), a name can be set for private anonymous memory, but not for shared anonymous memory. However, naming shared anonymous memory is just as useful for tracking purposes. Extend the functionality to be able to set names for shared anon. There are two ways to create anonymous shared memory, using memfd or directly via mmap(): 1. fd = memfd_create(...) mem = mmap(..., MAP_SHARED, fd, ...) 2. mem = mmap(..., MAP_SHARED | MAP_ANONYMOUS, -1, ...) In both cases the anonymous shared memory is created the same way by mapping an unlinked file on tmpfs. The memfd way allows giving a name to anonymous shared memory, but is not useful when parts of the shared memory need distinct names. Example use case: The VMM maps VM memory as anonymous shared memory (not private, because the VMM is sandboxed and drivers are running in their own processes). However, the VM reports back to the VMM how parts of the memory are actually used by the guest, how each of the segments should be backed (i.e. 4K pages, 2M pages), and some other information about the segments. The naming allows us to monitor the effective memory footprint for each of these segments from the host without looking inside the guest. Sample output: /* Create shared anonymous segment */ anon_shmem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); /* Name the segment: "MY-NAME" */ rv = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, anon_shmem, SIZE, "MY-NAME"); cat /proc//maps (and smaps): 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 [anon_shmem:MY-NAME] If the segment is not named, the output is: 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 /dev/zero (deleted) Link: https://lkml.kernel.org/r/20221115020602.804224-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Bagas Sanjaya Cc: Colin Cross Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: "Kirill A .
Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Peter Xu Cc: Sean Christopherson Cc: Vincent Whitchurch Cc: Vlastimil Babka Cc: xu xin Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a74cdcc9af0..89338950afd3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -274,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { + struct anon_vma_name *anon_name = NULL; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; vm_flags_t flags = vma->vm_flags; @@ -293,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); + if (mm) + anon_name = anon_vma_name(vma); /* * Print the dentry name for named mappings, and a @@ -300,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) */ if (file) { seq_pad(m, ' '); - seq_file_path(m, file, "\n"); + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. + */ + if (anon_name) + seq_printf(m, "[anon_shmem:%s]", anon_name->name); + else + seq_file_path(m, file, "\n"); goto done; } @@ -312,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - struct anon_vma_name *anon_name; - if (!mm) { name = "[vdso]"; goto done; @@ -330,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); seq_printf(m, "[anon:%s]", anon_name->name); -- cgit From dbaf7dc97ab8d526a20d3477419bc14b4890a82c Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 7 Nov 2022 09:56:59 +0800 Subject: hugetlbfs: inode: remove unnecessary (void*) conversions The ei pointer does not need to cast the type. Link: https://lkml.kernel.org/r/20221107015659.3221-1-zeming@nfschina.com Signed-off-by: Li zeming Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3ee84604e36d..790d2727141a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1279,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = { static void init_once(void *foo) { - struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } -- cgit From 6dd8fe86fa84729538d8bed3149faf9c5886bb5b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:52 -0800 Subject: ext4: convert move_extent_per_page() to use folios Patch series "Removing the try_to_release_page() wrapper", v3. This patchset replaces the remaining calls of try_to_release_page() with the folio equivalent: filemap_release_folio(). This allows us to remove the wrapper. This patch (of 4): Convert move_extent_per_page() to use folios. This change removes 5 calls to compound_head() and is in preparation for the removal of the try_to_release_page() wrapper. 
Link: https://lkml.kernel.org/r/20221118073055.55694-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221118073055.55694-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 52 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 044e34cd835c..8dbb87edf24c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; + struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; @@ -313,6 +314,13 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ + folio[0] = page_folio(pagep[0]); + folio[1] = page_folio(pagep[1]); + + VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); + VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); + VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -331,10 +339,10 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((page_has_private(pagep[0]) && - !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && - !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; goto drop_data_sem; } @@ -344,19 +352,21 @@ again: block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_pages; + goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); if (*err) - goto unlock_pages; + goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. 
*/ - if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; - goto unlock_pages; + goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, @@ -369,13 +379,13 @@ data_copy: replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto unlock_pages; + goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!page_has_buffers(pagep[0])) - create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); - bh = page_buffers(pagep[0]); + if (!folio_buffers(folio[0])) + create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); + bh = folio_buffers(folio[0]); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { @@ -385,7 +395,7 @@ data_copy: bh = bh->b_this_page; } if (!*err) - *err = block_commit_write(pagep[0], from, from + replaced_size); + *err = block_commit_write(&folio[0]->page, from, from + replaced_size); if (unlikely(*err < 0)) goto repair_branches; @@ -395,11 +405,11 @@ data_copy: *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); -unlock_pages: - unlock_page(pagep[0]); - put_page(pagep[0]); - unlock_page(pagep[1]); - put_page(pagep[1]); +unlock_folios: + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && @@ -430,7 +440,7 @@ repair_branches: *err = -EIO; } replaced_count = 0; - goto unlock_pages; + goto unlock_folios; } /** -- cgit From 2e41f274f9aa71cdcc69dc1f26a3f9304a651804 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:16 +0900 Subject: libfs: add DEFINE_SIMPLE_ATTRIBUTE_SIGNED for signed value Patch series "fix error when writing negative value to simple attribute files". The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), but some attribute files want to accept a negative value. This patch (of 3): The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value. This adds DEFINE_SIMPLE_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-1-akinobu.mita@gmail.com Link: https://lkml.kernel.org/r/20220919172418.45257-2-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. 
Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- fs/libfs.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 682d56345a1c..aada4e7c8713 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -995,8 +995,8 @@ out: EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ -ssize_t simple_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; @@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - ret = kstrtoull(attr->set_buf, 0, &val); + if (is_signed) + ret = kstrtoll(attr->set_buf, 0, &val); + else + ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); @@ -1027,8 +1030,21 @@ out: mutex_unlock(&attr->mutex); return ret; } + +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(simple_attr_write); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(simple_attr_write_signed); + /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on -- cgit From d472cf797c4e268613dbce5ec9b95d0bcae19ecb Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:18 +0900 Subject: debugfs: fix error when writing negative value to atomic_t debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value for a debugfs file created by debugfs_create_atomic_t(). This restores the previous behaviour by introducing DEFINE_DEBUGFS_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-4-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. 
Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- fs/debugfs/file.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ddb3fc258df9..b54f470e0d03 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, } EXPORT_SYMBOL_GPL(debugfs_attr_read); -ssize_t debugfs_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; @@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; - ret = simple_attr_write(file, buf, len, ppos); + if (is_signed) + ret = simple_attr_write_signed(file, buf, len, ppos); + else + ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(debugfs_attr_write); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); + static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, @@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** -- cgit From ce2fcf1516d674a174d9b34d1e1024d64de9fba3 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Wed, 9 Nov 2022 15:46:27 +0800 Subject: ocfs2: fix memory leak in ocfs2_mount_volume() There is a memory leak reported by kmemleak: unreferenced object 0xffff88810cc65e60 (size 32): comm "mount.ocfs2", pid 23753, jiffies 4302528942 (age 34735.105s) hex dump (first 32 bytes): 10 00 00 00 00 00 00 00 00 01 01 01 01 01 01 01 ................ 01 01 01 01 01 01 01 01 00 00 00 00 00 00 00 00 ................ backtrace: [] __kmalloc+0x4d/0x150 [] ocfs2_compute_replay_slots+0x121/0x330 [ocfs2] [] ocfs2_check_volume+0x485/0x900 [ocfs2] [] ocfs2_mount_volume.isra.0+0x1e9/0x650 [ocfs2] [] ocfs2_fill_super+0xe0b/0x1740 [ocfs2] [] mount_bdev+0x312/0x400 [] legacy_get_tree+0xed/0x1d0 [] vfs_get_tree+0x7d/0x230 [] path_mount+0xd62/0x1760 [] do_mount+0xca/0xe0 [] __x64_sys_mount+0x12c/0x1a0 [] do_syscall_64+0x35/0x80 [] entry_SYSCALL_64_after_hwframe+0x46/0xb0 This call stack is related to two problems. Firstly, the ocfs2 super uses "replay_map" to trace online/offline slots, in order to recover offline slots during recovery and mount. 
But when ocfs2_truncate_log_init() returns an error in ocfs2_mount_volume(), the memory of "replay_map" will not be freed in error handling path. Secondly, the memory of "replay_map" will not be freed if d_make_root() returns an error in ocfs2_fill_super(). But the memory of "replay_map" will be freed normally when completing recovery and mount in ocfs2_complete_mount_recovery(). Fix the first problem by adding error handling path to free "replay_map" when ocfs2_truncate_log_init() fails. And fix the second problem by calling ocfs2_free_replay_slots(osb) in the error handling path "out_dismount". In addition, since ocfs2_free_replay_slots() is static, it is necessary to remove its static attribute and declare it in header file. Link: https://lkml.kernel.org/r/20221109074627.2303950-1-lizetao1@huawei.com Fixes: 9140db04ef18 ("ocfs2: recover orphans in offline slots during recovery and mount") Signed-off-by: Li Zetao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 2 +- fs/ocfs2/journal.h | 1 + fs/ocfs2/super.c | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 126671e6caed..3fb98b4569a2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, replay_map->rm_state = REPLAY_DONE; } -static void ocfs2_free_replay_slots(struct ocfs2_super *osb) +void ocfs2_free_replay_slots(struct ocfs2_super *osb) { struct ocfs2_replay_map *replay_map = osb->replay_map; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 969d0aa28718..41c382f68529 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +void ocfs2_free_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 42c993e53924..0b0e6a132101 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) out_dismount: atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); + ocfs2_free_replay_slots(osb); ocfs2_dismount_volume(sb, 1); goto out; @@ -1822,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_truncate_log_init(osb); if (status < 0) { mlog_errno(status); - goto out_system_inodes; + goto out_check_volume; } ocfs2_super_unlock(osb, 1); return 0; +out_check_volume: + ocfs2_free_replay_slots(osb); out_system_inodes: if (osb->local_alloc_state == OCFS2_LA_ENABLED) ocfs2_shutdown_local_alloc(osb); -- cgit From 811b99fd237e17230d660c6000021f23ce3f3985 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 11 Nov 2022 02:56:48 -0500 Subject: fat (exportfs): fix some kernel-doc warnings Fix the following W=1 kernel build warning(s): fs/fat/nfs.c:21: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst fs/fat/nfs.c:139: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst Link: https://lkml.kernel.org/r/20221111075648.4005-1-liubo03@inspur.com Signed-off-by: Bo Liu Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/nfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index af191371c352..3626eb585a98 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -17,7 +17,7 @@ struct fat_fid { #define FAT_FID_SIZE_WITHOUT_PARENT 3 #define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) -/** +/* * Look up a directory inode given its starting cluster. */ static struct inode *fat_dget(struct super_block *sb, int i_logstart) @@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, return type; } -/** +/* * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ -- cgit From 63ffb573df66aea034d07fd00483d0a3cd4fed66 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 22 Nov 2022 03:04:00 +0100 Subject: efi: vars: prohibit reading random seed variables In anticipation of putting random seeds in EFI variables, it's important that the random GUID namespace of variables remains hidden from userspace. We accomplish this by not populating efivarfs with entries from that GUID, as well as denying the creation of new ones in that GUID. Signed-off-by: Jason A. Donenfeld Signed-off-by: Ard Biesheuvel --- fs/efivarfs/inode.c | 4 ++++ fs/efivarfs/super.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 939e5e242b98..617f3ad2485e 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -91,6 +91,10 @@ static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, err = guid_parse(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); if (err) goto out; + if (guid_equal(&var->var.VendorGuid, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) { + err = -EPERM; + goto out; + } if (efivar_variable_is_removable(var->var.VendorGuid, dentry->d_name.name, namelen)) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6780fc81cc11..07e82e246666 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -116,6 +116,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, int err = -ENOMEM; bool is_removable = false; + if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) + return 0; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return err; -- cgit From eee22187b53611e173161e38f61de1c7ecbeb876 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 17 Aug 2022 21:27:01 +0800 Subject: ext4: add inode table check in __ext4_get_inode_loc to avoid possible infinite loop In do_writepages, if the value returned by ext4_writepages is "-ENOMEM" and "wbc->sync_mode == WB_SYNC_ALL", retry until the condition is not met. In __ext4_get_inode_loc, if the bh returned by sb_getblk is NULL, the function returns -ENOMEM. In __getblk_slow, if the return value of grow_buffers is less than 0, the function returns NULL. 
When the three processes are connected in series like the following stack, an infinite loop may occur: do_writepages <--- keep retrying ext4_writepages mpage_map_and_submit_extent mpage_map_one_extent ext4_map_blocks ext4_ext_map_blocks ext4_ext_handle_unwritten_extents ext4_ext_convert_to_initialized ext4_split_extent ext4_split_extent_at __ext4_ext_dirty __ext4_mark_inode_dirty ext4_reserve_inode_write ext4_get_inode_loc __ext4_get_inode_loc <--- return -ENOMEM sb_getblk __getblk_gfp __getblk_slow <--- return NULL grow_buffers grow_dev_page <--- return -ENXIO ret = (block < end_block) ? 1 : -ENXIO; In this issue, bg_inode_table_hi is overwritten as an incorrect value. As a result, `block < end_block` cannot be met in grow_dev_page. Therefore, __ext4_get_inode_loc always returns '-ENOMEM' and do_writepages keeps retrying. As a result, the writeback process is in the D state due to an infinite loop. Add a check on inode table block in the __ext4_get_inode_loc function by referring to ext4_read_inode_bitmap to avoid this infinite loop. Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20220817132701.3015912-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 121ce2fccfab..6d16241666e6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4479,9 +4479,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); - block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + block = ext4_inode_table(sb, gdp); + if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || + (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { + ext4_error(sb, "Invalid inode table block %llu in " + "block_group %u", block, iloc->block_group); + return -EFSCORRUPTED; + } + block += (inode_offset / inodes_per_block); + bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; -- cgit From 71df9683827a17fb6279fcc2e52efdc7062a03b9 Mon Sep 17 00:00:00 2001 From: Jinpeng Cui Date: Wed, 31 Aug 2022 16:08:43 +0000 Subject: ext4: remove redundant variable err Return value directly from ext4_group_extend_no_check() instead of getting value from redundant variable err. 
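The shape of the cleanup, as a generic sketch (do_work() is a hypothetical stand-in, not the real callee):

	/* before: a local that only ferries the return value */
	int err;
	err = do_work();
	return err;

	/* after: return the callee's result directly */
	return do_work();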
Reported-by: Zeal Robot Signed-off-by: Jinpeng Cui Link: https://lore.kernel.org/r/20220831160843.305836-1-cui.jinpeng2@zte.com.cn Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 46b87ffeb304..6e88a85abd7f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1831,7 +1831,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1886,8 +1885,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - err = ext4_group_extend_no_check(sb, o_blocks_count, add); - return err; + return ext4_group_extend_no_check(sb, o_blocks_count, add); } /* ext4_group_extend */ -- cgit From 56d0d0b9289dae041becc7ee6bd966a00dd610e0 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 16 Sep 2022 17:28:16 -0700 Subject: ext4: check the return value of ext4_xattr_inode_dec_ref() Check the return value of ext4_xattr_inode_dec_ref(), which could return error code and need to be warned. Signed-off-by: Li Zhong Link: https://lore.kernel.org/r/20220917002816.3804400-1-floridsleeves@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 36d6ba7190b6..718ef3987f94 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1540,7 +1540,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { - ext4_xattr_inode_dec_ref(handle, ea_inode); + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return err; } -- cgit From e3ea75ee651daf5e434afbfdb7dbf75e200ea1f6 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 4 Oct 2022 15:58:03 +0200 Subject: ext4: journal_path mount options should follow links Before the commit 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") ext4 mount option journal_path did follow links in the provided path. Bring this behavior back by allowing to pass pathwalk flags to fs_lookup_param(). Fixes: 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") Signed-off-by: Lukas Czerner Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20221004135803.32283-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- fs/fs_parser.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4749b303f2a9..0f71542cf453 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2247,7 +2247,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - error = fs_lookup_param(fc, param, 1, &path); + error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ed40ce5742fd..edb3712dcfa5 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse); * @fc: The filesystem context to log errors through. * @param: The parameter. 
* @want_bdev: T if want a blockdev + * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, + unsigned int flags, struct path *_path) { struct filename *f; - unsigned int flags = 0; bool put_f; int ret; -- cgit From 5f3e240321dd00b251f91a37c70fc551a140c87b Mon Sep 17 00:00:00 2001 From: changfengnan Date: Sat, 8 Oct 2022 20:05:18 +0800 Subject: ext4: split ext4_journal_start trace for debug We might want to know in detail why the jbd2 thread is doing heavy I/O, so split the ext4_journal_start trace into ext4_journal_start_sb and ext4_journal_start_inode, and show the inode number and handle type when possible. Signed-off-by: changfengnan Link: https://lore.kernel.org/r/20221008120518.74870-1-changfengnan@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 14 ++++++++++---- fs/ext4/ext4_jbd2.h | 10 +++++----- fs/ext4/ialloc.c | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 8e1fb18f465e..77f318ec8abb 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -86,15 +86,21 @@ static int ext4_journal_check_start(struct super_block *sb) return 0; } -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, +handle_t *__ext4_journal_start_sb(struct inode *inode, + struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; - - trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, - _RET_IP_); + if (inode) + trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); + else + trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index db2ae4a2b38d..0c77697d5e90 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,9 +261,9 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks, - int revoke_creds); +handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb, + unsigned int line, int type, int blocks, + int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -303,7 +303,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb) } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ @@ -323,7 +323,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds) { - return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e9bc46684106..d701977678d4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1076,8 +1076,8 @@ repeat_in_this_group: if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { 
BUG_ON(nblocks <= 0); - handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, 0, + handle = __ext4_journal_start_sb(NULL, dir->i_sb, + line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); -- cgit From 9d720a5a658f5135861773f26e927449bef93d61 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Nov 2022 09:25:51 -0800 Subject: xfs: hoist refcount record merge predicates Hoist these multiline conditionals into separate static inline helpers to improve readability and set the stage for corruption fixes that will be introduced in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Xiao Yang --- fs/xfs/libxfs/xfs_refcount.c | 129 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 113 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3f34bafe18dd..4408893333a6 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -815,11 +815,119 @@ out_error: /* Is this extent valid? */ static inline bool xfs_refc_valid( - struct xfs_refcount_irec *rc) + const struct xfs_refcount_irec *rc) { return rc->rc_startblock != NULLAGBLOCK; } +static inline bool +xfs_refc_want_merge_center( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + bool cleft_is_cright, + enum xfs_refc_adjust_op adjust, + unsigned long long *ulenp) +{ + unsigned long long ulen = left->rc_blockcount; + + /* + * To merge with a center record, both shoulder records must be + * adjacent to the record we want to adjust. This is only true if + * find_left and find_right made all four records valid. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(right) || + !xfs_refc_valid(cleft) || !xfs_refc_valid(cright)) + return false; + + /* There must only be one record for the entire range. */ + if (!cleft_is_cright) + return false; + + /* The shoulder record refcounts must match the new refcount. */ + if (left->rc_refcount != cleft->rc_refcount + adjust) + return false; + if (right->rc_refcount != cleft->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cleft->rc_blockcount + right->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + *ulenp = ulen; + return true; +} + +static inline bool +xfs_refc_want_merge_left( + const struct xfs_refcount_irec *left, + const struct xfs_refcount_irec *cleft, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = left->rc_blockcount; + + /* + * For a left merge, the left shoulder record must be adjacent to the + * start of the range. If this is true, find_left made left and cleft + * contain valid contents. + */ + if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft)) + return false; + + /* Left shoulder record refcount must match the new refcount. */ + if (left->rc_refcount != cleft->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. 
+ */ + ulen += cleft->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + +static inline bool +xfs_refc_want_merge_right( + const struct xfs_refcount_irec *cright, + const struct xfs_refcount_irec *right, + enum xfs_refc_adjust_op adjust) +{ + unsigned long long ulen = right->rc_blockcount; + + /* + * For a right merge, the right shoulder record must be adjacent to the + * end of the range. If this is true, find_right made cright and right + * contain valid contents. + */ + if (!xfs_refc_valid(right) || !xfs_refc_valid(cright)) + return false; + + /* Right shoulder record refcount must match the new refcount. */ + if (right->rc_refcount != cright->rc_refcount + adjust) + return false; + + /* + * The new record cannot exceed the max length. ulen is a ULL as the + * individual record block counts can be up to (u32 - 1) in length + * hence we need to catch u32 addition overflows here. + */ + ulen += cright->rc_blockcount; + if (ulen >= MAXREFCEXTLEN) + return false; + + return true; +} + /* * Try to merge with any extents on the boundaries of the adjustment range. */ @@ -861,23 +969,15 @@ xfs_refcount_merge_extents( (cleft.rc_blockcount == cright.rc_blockcount); /* Try to merge left, cleft, and right. cleft must == cright. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + - right.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && - xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && - left.rc_refcount == cleft.rc_refcount + adjust && - right.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal, + adjust, &ulen)) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, &right, ulen, aglen); } /* Try to merge left and cleft. */ - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; - if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && - left.rc_refcount == cleft.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_left(&left, &cleft, adjust)) { *shape_changed = true; error = xfs_refcount_merge_left_extent(cur, &left, &cleft, agbno, aglen); @@ -893,10 +993,7 @@ xfs_refcount_merge_extents( } /* Try to merge cright and right. */ - ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; - if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && - right.rc_refcount == cright.rc_refcount + adjust && - ulen < MAXREFCEXTLEN) { + if (xfs_refc_want_merge_right(&cright, &right, adjust)) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, aglen); -- cgit From b25d1984aa884fc91a73a5a407b9ac976d441e9b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Nov 2022 09:25:51 -0800 Subject: xfs: estimate post-merge refcounts correctly Upon enabling fsdax + reflink for XFS, xfs/179 began to report refcount metadata corruptions after being run. Specifically, xfs_repair noticed single-block refcount records that could be combined but had not been. The root cause of this is improper MAXREFCOUNT edge case handling in xfs_refcount_merge_extents. When we're trying to find candidates for a refcount btree record merge, we compute the refcount attribute of the merged record, but we fail to account for the fact that once a record hits rc_refcount == MAXREFCOUNT, it is pinned that way forever. Hence the computed refcount is wrong, and we fail to merge the extents. 
Fix this by adjusting the merge predicates to compute the adjusted refcount correctly. Fixes: 3172725814f9 ("xfs: adjust refcount of an extent of blocks in refcount btree") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Xiao Yang --- fs/xfs/libxfs/xfs_refcount.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 4408893333a6..6f7ed9288fe4 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -820,6 +820,17 @@ xfs_refc_valid( return rc->rc_startblock != NULLAGBLOCK; } +static inline xfs_nlink_t +xfs_refc_merge_refcount( + const struct xfs_refcount_irec *irec, + enum xfs_refc_adjust_op adjust) +{ + /* Once a record hits MAXREFCOUNT, it is pinned there forever */ + if (irec->rc_refcount == MAXREFCOUNT) + return MAXREFCOUNT; + return irec->rc_refcount + adjust; +} + static inline bool xfs_refc_want_merge_center( const struct xfs_refcount_irec *left, @@ -831,6 +842,7 @@ xfs_refc_want_merge_center( unsigned long long *ulenp) { unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; /* * To merge with a center record, both shoulder records must be @@ -846,9 +858,10 @@ xfs_refc_want_merge_center( return false; /* The shoulder record refcounts must match the new refcount. */ - if (left->rc_refcount != cleft->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) return false; - if (right->rc_refcount != cleft->rc_refcount + adjust) + if (right->rc_refcount != new_refcount) return false; /* @@ -871,6 +884,7 @@ xfs_refc_want_merge_left( enum xfs_refc_adjust_op adjust) { unsigned long long ulen = left->rc_blockcount; + xfs_nlink_t new_refcount; /* * For a left merge, the left shoulder record must be adjacent to the @@ -881,7 +895,8 @@ xfs_refc_want_merge_left( return false; /* Left shoulder record refcount must match the new refcount. */ - if (left->rc_refcount != cleft->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cleft, adjust); + if (left->rc_refcount != new_refcount) return false; /* @@ -903,6 +918,7 @@ xfs_refc_want_merge_right( enum xfs_refc_adjust_op adjust) { unsigned long long ulen = right->rc_blockcount; + xfs_nlink_t new_refcount; /* * For a right merge, the right shoulder record must be adjacent to the @@ -913,7 +929,8 @@ xfs_refc_want_merge_right( return false; /* Right shoulder record refcount must match the new refcount. */ - if (right->rc_refcount != cright->rc_refcount + adjust) + new_refcount = xfs_refc_merge_refcount(cright, adjust); + if (right->rc_refcount != new_refcount) return false; /* -- cgit From 8c25febf23963431686f04874b96321288504127 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Thu, 1 Dec 2022 09:36:16 -0800 Subject: xfs: get rid of assert from xfs_btree_islastblock xfs_btree_check_block contains debugging knobs. With XFS_DEBUG setting up, turn on the debugging knob can trigger the assert of xfs_btree_islastblock, test script as follows: while true do mount $disk $mountpoint fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null echo 1 > /sys/fs/xfs/sda/errortag/btree_chk_sblk sleep 10 umount $mountpoint done Kick off fsstress and only *then* turn on the debugging knob. 
If it happens that the knob gets turned on after the cntbt lookup succeeds but before the call to xfs_btree_islastblock, then we *can* end up in the situation where a previously checked btree block suddenly starts returning EFSCORRUPTED from xfs_btree_check_block. Kaboom. Darrick gave a very detailed explanation as follows: Looking back at commit 27d9ee577dcce, I think the point of all this was to make sure that the cursor has actually performed a lookup, and that the btree block at whatever level we're asking about is ok. If the caller hasn't ever done a lookup, the bc_levels array will be empty, so the cur->bc_levels[level].bp pointer will be NULL. The call to xfs_btree_get_block will crash anyway, so the "ASSERT(block);" part is pointless. If the caller did a lookup but the lookup failed due to block corruption, the corresponding cur->bc_levels[level].bp pointer will also be NULL, and we'll still crash. The "ASSERT(xfs_btree_check_block);" logic is also unnecessary. If the cursor level points to an inode root, the block buffer will be incore, so it had better always be consistent. If the caller ignores a failed lookup after a successful one and calls this function, the cursor state is garbage and the assert wouldn't have tripped anyway. So get rid of the assert. Fixes: 27d9ee577dcc ("xfs: actually check xfs_btree_check_block return in xfs_btree_islastblock") Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index eef27858a013..29c4b4ccb909 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -556,7 +556,6 @@ xfs_btree_islastblock( struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); - ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); -- cgit From ddfdd530e43fcb3f7a0a69966e5f6c33497b4ae3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 1 Dec 2022 09:36:16 -0800 Subject: xfs: invalidate xfs_bufs when allocating cow extents While investigating test failures in xfs/17[1-3] in alwayscow mode, I noticed through code inspection that xfs_bmap_alloc_userdata isn't setting XFS_ALLOC_USERDATA when allocating extents for a file's CoW fork. COW staging extents should be flagged as USERDATA, since user data are persisted to these blocks before being remapped into a file. This mis-classification has a few impacts on the behavior of the system. First, the filestreams allocator is supposed to keep allocating from a chosen AG until it runs out of space in that AG. However, it only does that for USERDATA allocations, which means that COW allocations aren't tied to the filestreams AG. Fortunately, few people use filestreams, so nobody's noticed. A more serious problem is that xfs_alloc_ag_vextent_small looks for a buffer to invalidate *if* the USERDATA flag is set and the AG is so full that the allocation had to come from the AGFL because the cntbt is empty. The consequences of not invalidating the buffer are severe -- if the AIL incorrectly checkpoints a buffer that is now being used to store user data, that action will clobber the user's written data. Fix filestreams and yet another data corruption vector by flagging COW allocations as USERDATA. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 56b9b7db38bb..0d56a8d862e8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata( * the busy list. */ bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { bma->datatype |= XFS_ALLOC_USERDATA; if (bma->offset == 0) bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; -- cgit From e0cefada1383c5ceb5a35f08369d0d40a6629c18 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 1 Dec 2022 20:58:19 +0800 Subject: fscrypt: Add SM4 XTS/CTS symmetric algorithm support Add support for the XTS and CTS mode variants of the SM4 algorithm. The former is used to encrypt file contents, while the latter (SM4-CTS-CBC) is used to encrypt filenames. SM4 is a symmetric algorithm widely used in China, and is even a mandatory algorithm in some special scenarios. We need to provide these users with the ability to encrypt files or disks using SM4-XTS. Signed-off-by: Tianjia Zhang Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221201125819.36932-3-tianjia.zhang@linux.alibaba.com --- fs/crypto/keysetup.c | 15 +++++++++++++++ fs/crypto/policy.c | 5 +++++ 2 files changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 9e44dc078a81..94757ccd3056 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -44,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = { .security_strength = 16, .ivsize = 16, }, + [FSCRYPT_MODE_SM4_XTS] = { + .friendly_name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + .security_strength = 16, + .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, + }, + [FSCRYPT_MODE_SM4_CTS] = { + .friendly_name = "SM4-CTS-CBC", + .cipher_str = "cts(cbc(sm4))", + .keysize = 16, + .security_strength = 16, + .ivsize = 16, + }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 84fa51604b15..893661b52376 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -90,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) return true; + + if (contents_mode == FSCRYPT_MODE_SM4_XTS && + filenames_mode == FSCRYPT_MODE_SM4_CTS) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); } -- cgit From e7f703ff2507f4e9f496da96cd4b78fd3026120c Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 09:41:01 +0800 Subject: binfmt: Fix error return code in load_elf_fdpic_binary() Fix it to return a negative error code from create_elf_fdpic_tables() instead of 0. 
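The bug class is easy to demonstrate in isolation. A minimal userspace sketch (hypothetical helper names; the real fix is in load_elf_fdpic_binary(), shown in the diff below):

	#include <stdio.h>

	/* Pretend helper that fails, standing in for create_elf_fdpic_tables(). */
	static int create_tables(void) { return -22; /* like -EINVAL */ }

	static int load_buggy(void)
	{
		int retval = 0;			/* still 0 from earlier successes */

		if (create_tables() < 0)	/* result tested but thrown away */
			goto error;
		return 0;
	error:
		return retval;			/* bug: failure reported as 0 */
	}

	static int load_fixed(void)
	{
		int retval = create_tables();	/* capture the error code */

		if (retval < 0)
			goto error;
		return 0;
	error:
		return retval;			/* now propagates -22 */
	}

	int main(void)
	{
		printf("buggy=%d fixed=%d\n", load_buggy(), load_fixed());
		return 0;
	}

Compiled and run, the buggy variant reports 0 (success) even though the helper failed, while the fixed one propagates the negative error code.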
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Wang Yufen Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/1669945261-30271-1-git-send-email-wangyufen@huawei.com --- fs/binfmt_elf_fdpic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e90c1192dec6..096e3520a0b1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -434,8 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) current->mm->start_stack = current->mm->start_brk + stack_size; #endif - if (create_elf_fdpic_tables(bprm, current->mm, - &exec_params, &interp_params) < 0) + retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params, + &interp_params); + if (retval < 0) goto error; kdebug("- start_code %lx", current->mm->start_code); -- cgit From d6fdf29f7b99814d3673f2d9f4649262807cb836 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 1 Dec 2022 17:01:03 +0100 Subject: posix_acl: Fix the type of sentinel in get_acl The type should be struct posix_acl * instead of void *. Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Uros Bizjak Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 989bbf280bfe..e6643db35cce 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -115,7 +115,7 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, int type) { - void *sentinel; + struct posix_acl *sentinel; struct posix_acl **p; struct posix_acl *acl; -- cgit From c65234b283a65cfbfc94619655e820a5e55199eb Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 17 Oct 2022 09:42:30 +0800 Subject: configfs: fix possible memory leak in configfs_create_dir() kmemleak reported memory leaks in configfs_create_dir(): unreferenced object 0xffff888009f6af00 (size 192): comm "modprobe", pid 3777, jiffies 4295537735 (age 233.784s) backtrace: kmem_cache_alloc (mm/slub.c:3250 mm/slub.c:3256 mm/slub.c:3263 mm/slub.c:3273) new_fragment (./include/linux/slab.h:600 fs/configfs/dir.c:163) configfs_register_subsystem (fs/configfs/dir.c:1857) basic_write (drivers/hwtracing/stm/p_basic.c:14) stm_p_basic do_one_initcall (init/main.c:1296) do_init_module (kernel/module/main.c:2455) ... unreferenced object 0xffff888003ba7180 (size 96): comm "modprobe", pid 3777, jiffies 4295537735 (age 233.784s) backtrace: kmem_cache_alloc (mm/slub.c:3250 mm/slub.c:3256 mm/slub.c:3263 mm/slub.c:3273) configfs_new_dirent (./include/linux/slab.h:723 fs/configfs/dir.c:194) configfs_make_dirent (fs/configfs/dir.c:248) configfs_create_dir (fs/configfs/dir.c:296) configfs_attach_group.isra.28 (fs/configfs/dir.c:816 fs/configfs/dir.c:852) configfs_register_subsystem (fs/configfs/dir.c:1881) basic_write (drivers/hwtracing/stm/p_basic.c:14) stm_p_basic do_one_initcall (init/main.c:1296) do_init_module (kernel/module/main.c:2455) ... This is because the refcount is not correct in configfs_make_dirent(). For normal stage, the refcount is changing as: configfs_register_subsystem() configfs_create_dir() configfs_make_dirent() configfs_new_dirent() # set s_count = 1 dentry->d_fsdata = configfs_get(sd); # s_count = 2 ... configfs_unregister_subsystem() configfs_remove_dir() remove_dir() configfs_remove_dirent() # s_count = 1 dput() ... 
*dentry_unlink_inode()* configfs_d_iput() # s_count = 0, release However, if we failed in configfs_create(): configfs_register_subsystem() configfs_create_dir() configfs_make_dirent() # s_count = 2 ... configfs_create() # fail ->out_remove: configfs_remove_dirent(dentry) configfs_put(sd) # s_count = 1 return PTR_ERR(inode); There is no inode in the error path, so the configfs_d_iput() is lost and makes sd and fragment memory leaked. To fix this, when we failed in configfs_create(), manually call configfs_put(sd) to keep the refcount correct. Fixes: 7063fbf22611 ("[PATCH] configfs: User-driven configuration filesystem") Signed-off-by: Chen Zhongjin Signed-off-by: Christoph Hellwig --- fs/configfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d1f9d2632202..ec6519e1ca3b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -316,6 +316,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry, return 0; out_remove: + configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } @@ -382,6 +383,7 @@ int configfs_create_link(struct configfs_dirent *target, struct dentry *parent, return 0; out_remove: + configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } -- cgit From d9a4af5690e26afa8a2eb83c575d3a9ef52cde1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Nov 2022 17:27:14 +0106 Subject: printk: Convert console_drivers list to hlist Replace the open coded single linked list with a hlist so a conversion to SRCU protected list walks can reuse the existing primitives. Co-developed-by: John Ogness Signed-off-by: John Ogness Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-3-john.ogness@linutronix.de --- fs/proc/consoles.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index dfe6ce3505ce..cf2e0788f9c7 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -74,8 +74,9 @@ static void *c_start(struct seq_file *m, loff_t *pos) static void *c_next(struct seq_file *m, void *v, loff_t *pos) { struct console *con = v; + ++*pos; - return con->next; + return hlist_entry_safe(con->node.next, struct console, node); } static void c_stop(struct seq_file *m, void *v) -- cgit From 8b5dd40088f7a3d85e6072f027ec50f309f64876 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:22 +0106 Subject: proc: consoles: document console_lock usage The console_lock is held throughout the start/show/stop procedure to print out device/driver information about all registered consoles. Since the console_lock is being used for multiple reasons, explicitly document these reasons. This will be useful when the console_lock is split into fine-grained locking. 
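Condensed, the pattern being documented looks like this (trimmed from the code around the hunk above; the lookup body and error handling are elided):

	static void *c_start(struct seq_file *m, loff_t *pos)
	{
		struct console *con;
		loff_t off = 0;

		console_lock();		/* held for the entire walk */
		for_each_console(con)
			if (off++ == *pos)
				break;
		return con;
	}

	static void c_stop(struct seq_file *m, void *v)
	{
		console_unlock();	/* walk finished */
	}

Because seq_file guarantees that ->stop() is always called after a successful ->start(), the lock cannot leak even if ->show() fails partway through the traversal.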
Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-11-john.ogness@linutronix.de --- fs/proc/consoles.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index cf2e0788f9c7..46b305fa04ed 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -63,6 +63,15 @@ static void *c_start(struct seq_file *m, loff_t *pos) struct console *con; loff_t off = 0; + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + * + * Hold the console_lock to guarantee safe traversal of the + * console list. SRCU cannot be used because there is no + * place to store the SRCU cookie. + */ console_lock(); for_each_console(con) if (off++ == *pos) -- cgit From 28de287a95362f52489c5a3f90acfb2b99524dae Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 16 Nov 2022 17:27:46 +0106 Subject: proc: consoles: use console_list_lock for list iteration The console_lock is used in part to guarantee safe list iteration. The console_list_lock should be used because list synchronization responsibility will be removed from the console_lock in a later change. Note, the console_lock is still needed to serialize the device() callback with other console operations. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20221116162152.193147-35-john.ogness@linutronix.de --- fs/proc/consoles.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 46b305fa04ed..e0758fe7936d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v) if (con->device) { const struct tty_driver *driver; int index; + + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + */ + console_lock(); driver = con->device(con, &index); + console_unlock(); + if (driver) { dev = MKDEV(driver->major, driver->minor_start); dev += index; @@ -64,15 +73,11 @@ static void *c_start(struct seq_file *m, loff_t *pos) loff_t off = 0; /* - * Take console_lock to serialize device() callback with - * other console operations. For example, fg_console is - * modified under console_lock when switching vt. - * - * Hold the console_lock to guarantee safe traversal of the + * Hold the console_list_lock to guarantee safe traversal of the * console list. SRCU cannot be used because there is no * place to store the SRCU cookie. */ - console_lock(); + console_list_lock(); for_each_console(con) if (off++ == *pos) break; @@ -90,7 +95,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { - console_unlock(); + console_list_unlock(); } static const struct seq_operations consoles_op = { -- cgit From d272e01fa0a2f15c5c331a37cd99c6875c7b7186 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 15 Nov 2022 09:35:10 -0600 Subject: ksmbd: replace one-element arrays with flexible-array members One-element arrays are deprecated, and we are replacing them with flexible array members instead. So, replace one-element arrays with flexible-array members in multiple structs in fs/ksmbd/smb_common.h and one in fs/ksmbd/smb2pdu.h. 
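To make the difference concrete, here is a standalone sketch with a hypothetical struct (not one of the ksmbd ones): with a one-element array the trailing placeholder byte is counted by sizeof(), so sizing code needs a "- 1" correction like the one this patch removes from smb2pdu.c, whereas a flexible-array member contributes nothing to sizeof() and also lets FORTIFY_SOURCE bounds-check the trailing data.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct rec_old {		/* deprecated one-element style */
		unsigned int len;
		char name[1];		/* placeholder byte counted by sizeof() */
	};

	struct rec_new {		/* flexible-array style */
		unsigned int len;
		char name[];		/* contributes nothing to sizeof() */
	};

	int main(void)
	{
		const char *s = "hello";
		size_t n = strlen(s) + 1;

		/* old style: subtract the placeholder, as the removed "- 1" did */
		size_t old_sz = sizeof(struct rec_old) - 1 + n;
		/* new style: no correction term needed */
		size_t new_sz = sizeof(struct rec_new) + n;

		struct rec_new *r = malloc(new_sz);
		if (!r)
			return 1;
		r->len = (unsigned int)strlen(s);
		memcpy(r->name, s, n);
		printf("sizeof old=%zu new=%zu; alloc old=%zu new=%zu; name=%s\n",
		       sizeof(struct rec_old), sizeof(struct rec_new),
		       old_sz, new_sz, r->name);
		free(r);
		return 0;
	}
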
Important to mention is that doing a build before/after this patch results in no binary output differences. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/242 Link: https://github.com/KSPP/linux/issues/79 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/Y3OxronfaPYv9qGP@work --- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smb2pdu.h | 2 +- fs/ksmbd/smb_common.h | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d0..31f00cec5255 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3438,7 +3438,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + struct_sz = readdir_info_level_struct_sz(info_level) + conv_len; next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); d_info->last_entry_off_align = next_entry_offset - struct_sz; @@ -3690,7 +3690,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return -EOPNOTSUPP; conv_len = (d_info->name_len + 1) * 2; - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + next_entry_offset = ALIGN(struct_sz + conv_len, KSMBD_DIR_INFO_ALIGNMENT); if (next_entry_offset > d_info->out_buf_len) { diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 092fdd3f8750..aa5dbe54f5a1 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -443,7 +443,7 @@ struct smb2_posix_info { /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ u8 SidBuffer[32]; __le32 name_len; - u8 name[1]; + u8 name[]; /* * var sized owner SID * var sized group SID diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 318c16fa81da..e663ab9ea759 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -277,14 +277,14 @@ struct file_directory_info { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0x101 FF resp data */ struct file_names_info { __le32 NextEntryOffset; __u32 FileIndex; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0xc FF resp data */ struct file_full_directory_info { @@ -299,7 +299,7 @@ struct file_full_directory_info { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; - char FileName[1]; + char FileName[]; } __packed; /* level 0x102 FF resp */ struct file_both_directory_info { @@ -317,7 +317,7 @@ struct file_both_directory_info { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __packed; /* level 0x104 FFrsp data */ struct file_id_both_directory_info { @@ -337,7 +337,7 @@ struct file_id_both_directory_info { __u8 ShortName[24]; __le16 Reserved2; __le64 UniqueId; - char FileName[1]; + char FileName[]; } __packed; struct file_id_full_dir_info { @@ -354,7 +354,7 @@ struct file_id_full_dir_info { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __packed; /* level 0x105 FF rsp data */ struct smb_version_values { -- cgit From 
6a46bf558803dd2b959ca7435a5c143efe837217 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 2 Nov 2022 10:51:23 +0800 Subject: binfmt_misc: fix shift-out-of-bounds in check_special_flags UBSAN reported a shift-out-of-bounds warning: left shift of 1 by 31 places cannot be represented in type 'int' Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x8d/0xcf lib/dump_stack.c:106 ubsan_epilogue+0xa/0x44 lib/ubsan.c:151 __ubsan_handle_shift_out_of_bounds+0x1e7/0x208 lib/ubsan.c:322 check_special_flags fs/binfmt_misc.c:241 [inline] create_entry fs/binfmt_misc.c:456 [inline] bm_register_write+0x9d3/0xa20 fs/binfmt_misc.c:654 vfs_write+0x11e/0x580 fs/read_write.c:582 ksys_write+0xcf/0x120 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x34/0x80 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x4194e1 Since the type of Node's flags is unsigned long, we should define these macros with same type too. Signed-off-by: Liu Shixin Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221102025123.1117184-1-liushixin2@huawei.com --- fs/binfmt_misc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..bb202ad369d5 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -44,10 +44,10 @@ static LIST_HEAD(entries); static int enabled = 1; enum {Enabled, Magic}; -#define MISC_FMT_PRESERVE_ARGV0 (1 << 31) -#define MISC_FMT_OPEN_BINARY (1 << 30) -#define MISC_FMT_CREDENTIALS (1 << 29) -#define MISC_FMT_OPEN_FILE (1 << 28) +#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) +#define MISC_FMT_OPEN_BINARY (1UL << 30) +#define MISC_FMT_CREDENTIALS (1UL << 29) +#define MISC_FMT_OPEN_FILE (1UL << 28) typedef struct { struct list_head list; -- cgit From e1fce564900f8734edf15b87f028c57e14f6e28d Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 16:22:54 +0800 Subject: pstore/ram: Fix error return code in ramoops_probe() In the if (dev_of_node(dev) && !pdata) path, the "err" may be assigned a value of 0, so the error return code -EINVAL may be incorrectly set to 0. To fix set valid return code before calling to goto. Fixes: 35da60941e44 ("pstore/ram: add Device Tree bindings") Signed-off-by: Wang Yufen Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/1669969374-46582-1-git-send-email-wangyufen@huawei.com --- fs/pstore/ram.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 5a39fa8522b0..9a5052431fd3 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -745,6 +745,7 @@ static int ramoops_probe(struct platform_device *pdev) /* Make sure we didn't get bogus platform data pointer. */ if (!pdata) { pr_err("NULL platform data\n"); + err = -EINVAL; goto fail_out; } @@ -752,6 +753,7 @@ static int ramoops_probe(struct platform_device *pdev) !pdata->ftrace_size && !pdata->pmsg_size)) { pr_err("The memory size and the record/console size must be " "non-zero\n"); + err = -EINVAL; goto fail_out; } -- cgit From 1f5619ed881081be300db61da552ffae7163bb72 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Dec 2022 13:30:51 -0800 Subject: xfs: Remove duplicated include in xfs_iomap.c ./fs/xfs/xfs_iomap.c: xfs_error.h is included more than once. ./fs/xfs/xfs_iomap.c: xfs_errortag.h is included more than once. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3337 Reported-by: Abaci Robot Signed-off-by: Yang Li Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_iomap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 68436370927d..43f447199c08 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,8 +27,6 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" -#include "xfs_error.h" -#include "xfs_errortag.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) -- cgit From 7868f93006ad27c00ecddcc2904118aa705459ca Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 3 Dec 2022 01:07:24 +0000 Subject: udf: Increase UDF_MAX_READ_VERSION to 0x0260 Some discs containing the UDF file system are unable to be mounted, failing with the following message: UDF-fs: error (device sr0): udf_fill_super: minUDFReadRev=260 (max is 250) The UDF 2.60 specification [0] states in the section Basic Restrictions & Requirements (page 10): The Minimum UDF Read Revision value shall be at most #0250 for all media with a UDF 2.60 file system. This indicates that a UDF 2.50 implementation can read all UDF 2.60 media. Media that do not have a Metadata Partition may use a value lower than #250. The conclusion is that the discs failing to mount were burned with a faulty software, which didn't follow the specification. This can be worked around by increasing UDF_MAX_READ_VERSION to 0x260, to match the Minimum Read Revision. No other changes are required, as reading UDF 2.60 is backward compatible with UDF 2.50. [0] http://www.osta.org/specs/pdf/udf260.pdf Signed-off-by: Bartosz Taudul Signed-off-by: Jan Kara --- fs/udf/udf_sb.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4fa620543d30..291b56dd011e 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -6,7 +6,11 @@ #include #include -#define UDF_MAX_READ_VERSION 0x0250 +/* + * Even UDF 2.6 media should have version <= 0x250 but apparently there are + * some broken filesystems with version set to 0x260. Accommodate those. + */ +#define UDF_MAX_READ_VERSION 0x0260 #define UDF_MAX_WRITE_VERSION 0x0201 #define UDF_FLAG_USE_EXTENDED_FE 0 -- cgit From 83ae4133ac9410ac6a57136e464d498dc66200cf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:09 -0400 Subject: btrfs: add a cached_state to try_lock_extent With nowait becoming more pervasive throughout our codebase go ahead and add a cached_state to try_lock_extent(). This allows us to be faster about clearing the locked area if we have contention, and then gives us the same optimization for unlock if we are able to lock the range. 
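The resulting calling pattern, sketched from the fs/btrfs/file.c hunk below (simplified; page cleanup on the failure path is elided):

        struct extent_state *cached_state = NULL;

        /* On contention, try_lock_extent() now caches the contended state,
         * so its internal clear_extent_bit() cleanup skips a tree search. */
        if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
                             &cached_state))
                return -EAGAIN;

        /* ... operate on the locked range ... */

        /* On success, cached_state points at the locked state, so the
         * unlock also avoids searching the tree. */
        unlock_extent(&inode->io_tree, start_pos, last_pos, &cached_state);
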
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 7 ++++--- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 3 ++- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/relocation.c | 2 +- 7 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 83cb0378096f..1b0a45b51f4c 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1615,17 +1615,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - NULL, NULL, GFP_NOFS); + cached, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, NULL); + EXTENT_LOCKED, cached); return 0; } return 1; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index a855f40dd61d..786be8f38f0b 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -106,7 +106,8 @@ void extent_io_tree_release(struct extent_io_tree *tree); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dcf22e051ff..c7e94a0e60d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4959,7 +4959,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + NULL)) return -EAGAIN; } else { ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d01631d47806..98107466572b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1302,7 +1302,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state)) { for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0e516aefbf51..2ba2d8b9cefc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7255,7 +7255,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend)) + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) return -EAGAIN; } else { lock_extent(io_tree, lockstart, lockend, cached_state); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e54f8280031f..b648c9d4ea0f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1073,7 +1073,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode 
*inode, u64 start, u64 end) { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end)) + if (!try_lock_extent(&inode->io_tree, start, end, NULL)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 666a37a0ee89..e81a21082e58 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1120,7 +1120,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, NULL); if (!ret) continue; -- cgit From 632ddfa2131f0fea1831bc1f4b28c68faa779156 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:10 -0400 Subject: btrfs: use cached_state for btrfs_check_nocow_lock Now that try_lock_extent() takes a cached_state, plumb the cached_state through btrfs_try_lock_ordered_range() and then use a cached_state in btrfs_check_nocow_lock everywhere to avoid extra tree searches on the extent_io_tree. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 ++++++--- fs/btrfs/ordered-data.c | 7 ++++--- fs/btrfs/ordered-data.h | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 98107466572b..493cae66e5e6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1373,6 +1373,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; u64 lockstart, lockend; u64 num_bytes; int ret; @@ -1389,12 +1390,14 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, num_bytes = lockend - lockstart + 1; if (nowait) { - if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) { + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, + &cached_state)) { btrfs_drew_write_unlock(&root->snapshot_lock); return -EAGAIN; } } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, + &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL, nowait, false); @@ -1403,7 +1406,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, NULL); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b648c9d4ea0f..de2b716d3e7b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1069,11 +1069,12 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, * Return true if btrfs_lock_ordered_range does not return any extents, * otherwise false. 
*/ -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end, NULL)) + if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1081,7 +1082,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, cached_state); return false; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f59f2dbdb25e..89f82b78f590 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -206,7 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, u64 post); int __init ordered_data_init(void); -- cgit From 9c5c9604631ae5fcdc5124c79d01d75f80b5ffd4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:11 -0400 Subject: btrfs: use a cached_state everywhere in relocation All of the relocation code avoids using the cached state, despite everywhere using the normal lock_extent() // do something unlock_extent() pattern. Fix this by plumbing a cached state throughout all of these functions in order to allow for less tree searches. 
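One detail worth noting in the hunks below: the cached state is declared per locked range rather than per function, because the unlock consumes it. A minimal sketch of that shape (loop bodies elided):

        for (nr = 0; nr < cluster->nr; nr++) {
                struct extent_state *cached_state = NULL; /* one per range */

                /* lock_extent() records the extent state covering this range */
                lock_extent(&inode->io_tree, start, end, &cached_state);
                /* ... prealloc / mark delalloc / drop extent maps ... */
                unlock_extent(&inode->io_tree, start, end, &cached_state);
                /* the unlock drops the cached reference, so the next
                 * iteration starts from a fresh NULL pointer */
        }
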
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e81a21082e58..9dcf9b39c798 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1113,6 +1113,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + struct extent_state *cached_state = NULL; + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, @@ -1120,14 +1122,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, NULL); + key.offset, end, + &cached_state); if (!ret) continue; btrfs_drop_extent_map_range(BTRFS_I(inode), key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, NULL); + key.offset, end, &cached_state); } } @@ -1516,6 +1519,8 @@ static int invalidate_extent_cache(struct btrfs_root *root, objectid = min_key->objectid; while (1) { + struct extent_state *cached_state = NULL; + cond_resched(); iput(inode); @@ -1566,9 +1571,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); } return 0; } @@ -2863,19 +2868,21 @@ static noinline_for_stack int prealloc_file_extent_cluster( btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { + struct extent_state *cached_state = NULL; + start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, NULL); + lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } @@ -2891,6 +2898,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod u64 start, u64 end, u64 block_start) { struct extent_map *em; + struct extent_state *cached_state = NULL; int ret = 0; em = alloc_extent_map(); @@ -2903,9 +2911,9 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->block_start = block_start; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); free_extent_map(em); return ret; @@ -2983,6 +2991,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, */ cur = max(page_start, cluster->boundary[*cluster_nr] - offset); while (cur <= 
page_end) { + struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; @@ -2998,13 +3007,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, - clamped_end, 0, NULL); + clamped_end, 0, &cached_state); if (ret) { - clear_extent_bits(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY); + clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -3031,7 +3042,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, boundary_start, boundary_end, EXTENT_BOUNDARY); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; -- cgit From 123a7f008c9e2b25b451c116620f1f6c77bd6b2b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:12 -0400 Subject: btrfs: cache the failed state when locking extents Currently if we fail to lock a range we'll return the start of the range that we failed to lock. We'll then search down to this range and wait on any extent states in this range. However we can avoid this search altogether if we simply cache the extent_state that had the contention. We can pass this into wait_extent_bit() and start from that extent_state without doing the search. In the most optimistic case we can avoid all searches, more likely we'll avoid the initial search and have to perform the search after we wait on the failed state, or worst case we must search both times which is what currently happens. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 52 +++++++++++++++++++++++++++++++++++------------ fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/extent_io.c | 3 ++- 3 files changed, 43 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 1b0a45b51f4c..a630c771d25c 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -714,7 +714,8 @@ static void wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; @@ -722,6 +723,16 @@ void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) spin_lock(&tree->lock); again: + /* + * Maintain cached_state, as we may not remove it from the tree if there + * are more bits than the bits we're waiting on set on this state. 
+ */ + if (cached_state && *cached_state) { + state = *cached_state; + if (extent_state_in_tree(state) && + state->start <= start && start < state->end) + goto process_node; + } while (1) { /* * This search will find all the extents that end after our @@ -752,6 +763,12 @@ process_node: } } out: + /* This state is no longer useful, clear it and free it up. */ + if (cached_state && *cached_state) { + state = *cached_state; + *cached_state = NULL; + free_extent_state(state); + } spin_unlock(&tree->lock); } @@ -939,13 +956,17 @@ out: * sleeping, so the gfp mask is used to indicate what is allowed. * * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. + * part of the range already has the desired bits set. The extent_state of the + * existing range is returned in failed_state in this case, and the start of the + * existing range is returned in failed_start. failed_state is used as an + * optimization for wait_extent_bit, failed_start must be used as the source of + * truth as failed_state may have changed since we returned. * * [start, end] is inclusive This takes the tree lock. */ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u64 *failed_start, + struct extent_state **failed_state, struct extent_state **cached_state, struct extent_changeset *changeset, gfp_t mask) { @@ -964,7 +985,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (exclusive_bits) ASSERT(failed_start); else - ASSERT(failed_start == NULL); + ASSERT(failed_start == NULL && failed_state == NULL); again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -1012,6 +1033,7 @@ hit_next: if (state->start == start && state->end <= end) { if (state->state & exclusive_bits) { *failed_start = state->start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1047,6 +1069,7 @@ hit_next: if (state->start < start) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1125,6 +1148,7 @@ hit_next: if (state->start <= end && state->end > end) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1162,8 +1186,8 @@ out: int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, gfp_t mask) { - return __set_extent_bit(tree, start, end, bits, NULL, cached_state, - NULL, mask); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, + cached_state, NULL, mask); } /* @@ -1598,8 +1622,8 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset, - GFP_NOFS); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, + changeset, GFP_NOFS); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1622,7 +1646,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - cached, NULL, GFP_NOFS); + NULL, cached, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1639,20 +1663,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, int lock_extent(struct 
extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { + struct extent_state *failed_state = NULL; int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - cached_state, NULL, GFP_NOFS); + &failed_state, cached_state, NULL, GFP_NOFS); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, EXTENT_LOCKED, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, + &failed_state); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, - &failed_start, cached_state, NULL, - GFP_NOFS); + &failed_start, &failed_state, + cached_state, NULL, GFP_NOFS); } return err; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 786be8f38f0b..c71aa29f719d 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -235,6 +235,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c7e94a0e60d5..836851233618 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5001,7 +5001,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (ret || wait != WAIT_COMPLETE) return ret; - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, + EXTENT_LOCKED, NULL); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; -- cgit From e5e886bad9e9e87b767ade3884faec1cfdec9b43 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:13 -0400 Subject: btrfs: add cached_state to read_extent_buffer_subpage We don't use a cached state here at all, which generally makes sense as async reads are going to unlock at endio time. However for blocking reads we will call wait_extent_bit() for our range. Since the lock_extent() stuff will return the cached_state for the start of the range this is a helpful optimization to have for this case, we'll have the exact state we want to wait on. Add a cached state here and simply throw it away if we're a non-blocking read, otherwise we'll get a small improvement by eliminating some tree searches. 
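Sketched from the hunk below: the blocking path threads a single cached state from the lock through the wait, while the non-blocking path simply frees it, since endio does the unlocking (simplified):

        struct extent_state *cached_state = NULL;

        ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
                          &cached_state);
        /* ... submit the read bio ... */
        if (ret || wait != WAIT_COMPLETE) {
                /* async read: nothing to wait on here */
                free_extent_state(cached_state);
                return ret;
        }
        /* blocking read: wait on exactly the state we locked */
        wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
                        EXTENT_LOCKED, &cached_state);
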
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 836851233618..7891d375eb43 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4949,6 +4949,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, }; @@ -4960,10 +4961,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (wait == WAIT_NONE) { if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - NULL)) + &cached_state)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); if (ret < 0) return ret; } @@ -4973,7 +4975,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -4998,11 +5001,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, atomic_dec(&eb->io_pages); } submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) + if (ret || wait != WAIT_COMPLETE) { + free_extent_state(cached_state); return ret; + } wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, - EXTENT_LOCKED, NULL); + EXTENT_LOCKED, &cached_state); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; -- cgit From c1b078545e64da7a2b67b74b9d12813b5dd2a529 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 7 Oct 2022 18:33:35 +0200 Subject: btrfs: simplify cleanup after error in btrfs_create_tree Since leaf is already NULL, and no other branch will go to fail_unlock, the fail_unlock label is useless and can be removed Signed-off-by: Peng Hao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d99bf7c64611..3460eaa9e4aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1197,7 +1197,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail_unlock; + goto fail; } root->node = leaf; @@ -1232,9 +1232,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; -fail_unlock: - if (leaf) - btrfs_tree_unlock(leaf); fail: btrfs_put_root(root); -- cgit From d60d956eb41f0945c2438af11ed412109c558f4f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:25 -0400 Subject: btrfs: remove unused set/clear_pending_info helpers The last users of these helpers were removed in 5297199a8bca ("btrfs: remove inode number cache feature") so delete these helpers. The point was for mount options that were applicable after transaction commit so they could not be applied immediately. We don't have such options anymore and if we do the patch can be reverted. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9e6d48ff4597..3be1db2c558f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1661,30 +1661,6 @@ do { \ #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -/* - * Helpers for setting pending mount option changes. - * - * Expects corresponding macros - * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name - */ -#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), SET_##opt); \ - btrfs_clear_pending((info), CLEAR_##opt); \ - } \ -} while(0) - -#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), CLEAR_##opt); \ - btrfs_clear_pending((info), SET_##opt); \ - } \ -} while(0) - /* * Inode flags */ -- cgit From ea206640a600c808d30d8d62c44b6982c12a143b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:26 -0400 Subject: btrfs: remove unused BTRFS_TOTAL_BYTES_PINNED_BATCH This hasn't been used since 138a12d86574 ("btrfs: rip out btrfs_space_info::total_bytes_pinned") so it is safe to remove. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3be1db2c558f..8bd49328efab 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -92,14 +92,6 @@ struct reloc_control; #define BTRFS_DIRTY_METADATA_THRESH SZ_32M -/* - * Use large batch size to reduce overhead of metadata updates. On the reader - * side, we only read it when we are close to ENOSPC and the read overhead is - * mostly related to the number of CPUs, so it is OK to use arbitrary large - * value here. - */ -#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M - #define BTRFS_MAX_EXTENT_SIZE SZ_128M /* -- cgit From 4ce76e8e783693c72956e817f3a6dbbaa055411a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:27 -0400 Subject: btrfs: remove unused BTRFS_IOPRIO_READA The last user of this definition was removed in patch f26c92386028 ("btrfs: remove reada infrastructure") so we can remove this definition. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8bd49328efab..7760cc07288e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -87,9 +87,6 @@ struct reloc_control; #define BTRFS_EMPTY_DIR_SIZE 0 -/* ioprio of readahead is set to idle */ -#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) - #define BTRFS_DIRTY_METADATA_THRESH SZ_32M #define BTRFS_MAX_EXTENT_SIZE SZ_128M -- cgit From 4300c58f809079951c87d84e5f11a2d265e3c9e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:28 -0400 Subject: btrfs: move btrfs on-disk definitions out of ctree.h The bulk of our on-disk definitions exist in btrfs_tree.h, which user space can use. 
Keep things consistent and move the rest of the on disk definitions out of ctree.h into btrfs_tree.h. Note I did have to update all u8's to __u8, but otherwise this is a strict copy and paste. Most of the definitions are mainly for internal use and are not guaranteed stable public API and may change as we need. Compilation failures by user applications can happen. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments, style fixups ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 215 +------------------------------------------------------ 1 file changed, 1 insertion(+), 214 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7760cc07288e..7a80f9380146 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -55,8 +55,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ - /* * Maximum number of mirrors that can be available for all profiles counting * the target device of dev-replace as one. During an active device replace @@ -68,8 +66,6 @@ struct reloc_control; */ #define BTRFS_MAX_MIRRORS (4 + 1) -#define BTRFS_MAX_LEVEL 8 - #define BTRFS_OLDEST_GENERATION 0ULL /* @@ -138,81 +134,9 @@ enum { BTRFS_FS_STATE_COUNT }; -#define BTRFS_BACKREF_REV_MAX 256 -#define BTRFS_BACKREF_REV_SHIFT 56 -#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ - BTRFS_BACKREF_REV_SHIFT) - -#define BTRFS_OLD_BACKREF_REV 0 -#define BTRFS_MIXED_BACKREF_REV 1 - -/* - * every tree block (leaf or node) starts with this header. - */ -struct btrfs_header { - /* these first four must match the super block */ - u8 csum[BTRFS_CSUM_SIZE]; - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - __le64 bytenr; /* which block this node is supposed to live in */ - __le64 flags; - - /* allowed to be different from the super from here on down */ - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - __le64 generation; - __le64 owner; - __le32 nritems; - u8 level; -} __attribute__ ((__packed__)); - -/* - * this is a very generous portion of the super block, giving us - * room to translate 14 chunks with 3 stripes each. - */ -#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 - -/* - * just in case we somehow lose the roots and are not able to mount, - * we store an array of the roots from previous transactions - * in the super. - */ -#define BTRFS_NUM_BACKUP_ROOTS 4 -struct btrfs_root_backup { - __le64 tree_root; - __le64 tree_root_gen; - - __le64 chunk_root; - __le64 chunk_root_gen; - - __le64 extent_root; - __le64 extent_root_gen; - - __le64 fs_root; - __le64 fs_root_gen; - - __le64 dev_root; - __le64 dev_root_gen; - - __le64 csum_root; - __le64 csum_root_gen; - - __le64 total_bytes; - __le64 bytes_used; - __le64 num_devices; - /* future */ - __le64 unused_64[4]; - - u8 tree_root_level; - u8 chunk_root_level; - u8 extent_root_level; - u8 fs_root_level; - u8 dev_root_level; - u8 csum_root_level; - /* future and to align */ - u8 unused_8[10]; -} __attribute__ ((__packed__)); - #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* * The reserved space at the beginning of each device. 
@@ -221,69 +145,6 @@ struct btrfs_root_backup { */ #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) -/* - * the super block basically lists the main trees of the FS - * it currently lacks any block count etc etc - */ -struct btrfs_super_block { - /* the first 4 fields must match struct btrfs_header */ - u8 csum[BTRFS_CSUM_SIZE]; - /* FS specific UUID, visible to user */ - u8 fsid[BTRFS_FSID_SIZE]; - __le64 bytenr; /* this block number */ - __le64 flags; - - /* allowed to be different from the btrfs_header from here own down */ - __le64 magic; - __le64 generation; - __le64 root; - __le64 chunk_root; - __le64 log_root; - - /* - * This member has never been utilized since the very beginning, thus - * it's always 0 regardless of kernel version. We always use - * generation + 1 to read log tree root. So here we mark it deprecated. - */ - __le64 __unused_log_root_transid; - __le64 total_bytes; - __le64 bytes_used; - __le64 root_dir_objectid; - __le64 num_devices; - __le32 sectorsize; - __le32 nodesize; - __le32 __unused_leafsize; - __le32 stripesize; - __le32 sys_chunk_array_size; - __le64 chunk_root_generation; - __le64 compat_flags; - __le64 compat_ro_flags; - __le64 incompat_flags; - __le16 csum_type; - u8 root_level; - u8 chunk_root_level; - u8 log_root_level; - struct btrfs_dev_item dev_item; - - char label[BTRFS_LABEL_SIZE]; - - __le64 cache_generation; - __le64 uuid_tree_generation; - - /* the UUID written into btree blocks */ - u8 metadata_uuid[BTRFS_FSID_SIZE]; - - /* future expansion */ - u8 reserved8[8]; - __le64 reserved[27]; - u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; - struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; - - /* Padded to 4096 bytes */ - u8 padding[565]; -} __attribute__ ((__packed__)); -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - /* * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount @@ -341,43 +202,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL -/* - * A leaf is full of items. offset and size tell us where to find - * the item in the leaf (relative to the start of the data area) - */ -struct btrfs_item { - struct btrfs_disk_key key; - __le32 offset; - __le32 size; -} __attribute__ ((__packed__)); - -/* - * leaves have an item area and a data area: - * [item0, item1....itemN] [free space] [dataN...data1, data0] - * - * The data is separate from the items to get the keys closer together - * during searches. 
- */ -struct btrfs_leaf { - struct btrfs_header header; - struct btrfs_item items[]; -} __attribute__ ((__packed__)); - -/* - * all non-leaf blocks are nodes, they hold only keys and pointers to - * other blocks - */ -struct btrfs_key_ptr { - struct btrfs_disk_key key; - __le64 blockptr; - __le64 generation; -} __attribute__ ((__packed__)); - -struct btrfs_node { - struct btrfs_header header; - struct btrfs_key_ptr ptrs[]; -} __attribute__ ((__packed__)); - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, @@ -1650,43 +1474,6 @@ do { \ #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -/* - * Inode flags - */ -#define BTRFS_INODE_NODATASUM (1U << 0) -#define BTRFS_INODE_NODATACOW (1U << 1) -#define BTRFS_INODE_READONLY (1U << 2) -#define BTRFS_INODE_NOCOMPRESS (1U << 3) -#define BTRFS_INODE_PREALLOC (1U << 4) -#define BTRFS_INODE_SYNC (1U << 5) -#define BTRFS_INODE_IMMUTABLE (1U << 6) -#define BTRFS_INODE_APPEND (1U << 7) -#define BTRFS_INODE_NODUMP (1U << 8) -#define BTRFS_INODE_NOATIME (1U << 9) -#define BTRFS_INODE_DIRSYNC (1U << 10) -#define BTRFS_INODE_COMPRESS (1U << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) - -#define BTRFS_INODE_FLAG_MASK \ - (BTRFS_INODE_NODATASUM | \ - BTRFS_INODE_NODATACOW | \ - BTRFS_INODE_READONLY | \ - BTRFS_INODE_NOCOMPRESS | \ - BTRFS_INODE_PREALLOC | \ - BTRFS_INODE_SYNC | \ - BTRFS_INODE_IMMUTABLE | \ - BTRFS_INODE_APPEND | \ - BTRFS_INODE_NODUMP | \ - BTRFS_INODE_NOATIME | \ - BTRFS_INODE_DIRSYNC | \ - BTRFS_INODE_COMPRESS | \ - BTRFS_INODE_ROOT_ITEM_INIT) - -#define BTRFS_INODE_RO_VERITY (1U << 0) - -#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) - struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; -- cgit From 51129b33d3911c7a36e643d47cf7c00fba3089fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:29 -0400 Subject: btrfs: move btrfs_get_block_group helper out of disk-io.h This inline helper calls btrfs_fs_compat_ro(), which is defined in another header. To avoid weird header dependency problems move this helper into disk-io.c with the rest of the global root helpers. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +++++++ fs/btrfs/disk-io.h | 8 +------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3460eaa9e4aa..5705335c80c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1167,6 +1167,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9fa923e005a3..da040ec327ae 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -75,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -106,13 +107,6 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } -static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, -- cgit From ad4b63caf56d0dc08e03966172d685ff9ebad996 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:30 -0400 Subject: btrfs: move maximum limits to btrfs_tree.h We have maximum link and name length limits, move these to btrfs_tree.h as they're on disk limitations. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7a80f9380146..6a37df0fa0eb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -68,19 +68,6 @@ struct reloc_control; #define BTRFS_OLDEST_GENERATION 0ULL -/* - * we can actually store much bigger names, but lets not confuse the rest - * of linux - */ -#define BTRFS_NAME_LEN 255 - -/* - * Theoretical limit is larger, but we keep this down to a sane - * value. That should limit greatly the possibility of collisions on - * inode ref items. - */ -#define BTRFS_LINK_MAX 65535U - #define BTRFS_EMPTY_DIR_SIZE 0 #define BTRFS_DIRTY_METADATA_THRESH SZ_32M -- cgit From ed4c491a3db2e85b3eb04ac61f0723fc2ec5f50a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:31 -0400 Subject: btrfs: move BTRFS_MAX_MIRRORS into scrub.c This is only used locally in scrub.c, move it out of ctree.h into scrub.c. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 ----------- fs/btrfs/scrub.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6a37df0fa0eb..ea83c178a60c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -55,17 +55,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -/* - * Maximum number of mirrors that can be available for all profiles counting - * the target device of dev-replace as one. During an active device replace - * procedure, the target device of the copy operation is a mirror for the - * filesystem data as well that can be used to read data in order to repair - * read errors on other disks. - * - * Current value is derived from RAID1C4 with 4 copies. - */ -#define BTRFS_MAX_MIRRORS (4 + 1) - #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 196c4c6ed1ed..cdda4b2f20f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -56,6 +56,17 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1C4 with 4 copies. + */ +#define BTRFS_MAX_MIRRORS (4 + 1) + struct scrub_recover { refcount_t refs; struct btrfs_io_context *bioc; -- cgit From 390d89ccf672ec79b84037d4af834b2275ff7cb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:32 -0400 Subject: btrfs: move discard stat defs to free-space-cache.h These definitions are used for discard statistics, move them out of ctree.h and put them in free-space-cache.h. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --------- fs/btrfs/free-space-cache.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea83c178a60c..03d665b21244 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -63,15 +63,6 @@ struct reloc_control; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * Deltas are an effective way to populate global statistics. Give macro names - * to make it clear what we're doing. An example is discard_extents in - * btrfs_free_space_ctl. - */ -#define BTRFS_STAT_NR_ENTRIES 2 -#define BTRFS_STAT_CURR 0 -#define BTRFS_STAT_PREV 1 - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 6d419ba53e95..eaf30f6444dd 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -43,6 +43,15 @@ static inline bool btrfs_free_space_trimming_bitmap( return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. 
+ */ +#define BTRFS_STAT_NR_ENTRIES 2 +#define BTRFS_STAT_CURR 0 +#define BTRFS_STAT_PREV 1 + struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; -- cgit From 06d61cb101f346338b50be5cd4f053a169b59d5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:34 -0400 Subject: btrfs: move btrfs_should_fragment_free_space into block-group.c This function uses functions that are not defined in block-group.h, move it into block-group.c in order to keep the header clean. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 12 ++++++++++++ fs/btrfs/block-group.h | 11 +---------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index deebc8ddbd93..3f8b1cbbbc43 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -18,6 +18,18 @@ #include "raid56.h" #include "zoned.h" +#ifdef CONFIG_BTRFS_DEBUG +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8fb14b99a1d1..39e79bed10ae 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -251,16 +251,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -static inline int btrfs_should_fragment_free_space( - struct btrfs_block_group *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( -- cgit From f1e5c6185ca166cde0c7c2eeeab5d233ef315140 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:35 -0400 Subject: btrfs: move flush related definitions to space-info.h This code is used in space-info.c, move the definitions to space-info.h. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 59 ------------------------------------------------ fs/btrfs/delayed-inode.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/space-info.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 63 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03d665b21244..39974f4b441f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2639,65 +2639,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -/* - * Different levels for to flush space when doing space reservations. - * - * The higher the level, the more methods we try to reclaim space. 
- */ -enum btrfs_reserve_flush_enum { - /* If we are in the transaction, we can't flush anything.*/ - BTRFS_RESERVE_NO_FLUSH, - - /* - * Flush space by: - * - Running delayed inode items - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_LIMIT, - - /* - * Flush space by: - * - Running delayed inode items - * - Running delayed refs - * - Running delalloc and waiting for ordered extents - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_EVICT, - - /* - * Flush space by above mentioned methods and by: - * - Running delayed iputs - * - Committing transaction - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_DATA, - BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, - BTRFS_RESERVE_FLUSH_ALL, - - /* - * Pretty much the same as FLUSH_ALL, but can also steal space from - * global rsv. - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_ALL_STEAL, -}; - -enum btrfs_flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELAYED_REFS_NR = 3, - FLUSH_DELAYED_REFS = 4, - FLUSH_DELALLOC = 5, - FLUSH_DELALLOC_WAIT = 6, - FLUSH_DELALLOC_FULL = 7, - ALLOC_CHUNK = 8, - ALLOC_CHUNK_FORCE = 9, - RUN_DELAYED_IPUTS = 10, - COMMIT_TRANS = 11, -}; - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index cac5169eaf8d..a411f04a7b97 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -14,6 +14,7 @@ #include "qgroup.h" #include "locking.h" #include "inode-item.h" +#include "space-info.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 0eeb5ea87894..366f3a788c6a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "transaction.h" #include "print-tree.h" +#include "space-info.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 055a631276ce..07f62e3ba6a5 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -10,6 +10,7 @@ #include "ctree.h" #include "xattr.h" #include "compression.h" +#include "space-info.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9dcf9b39c798..216a4485d914 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -27,6 +27,7 @@ #include "subpage.h" #include "zoned.h" #include "inode-item.h" +#include "space-info.h" /* * Relocation overview diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index ce66023a9eb8..7e17bb803436 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -5,6 +5,65 @@ #include "volumes.h" +/* + * Different levels for to flush space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. 
+ */ +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + + /* + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Committing transaction + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, + BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_ALL_STEAL, +}; + +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, +}; + struct btrfs_space_info { spinlock_t lock; -- cgit From f60acad355cf14ccccf420e6ea0ddd6de87cb210 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:36 -0400 Subject: btrfs: move btrfs_print_data_csum_error into inode.c This isn't used outside of inode.c, there's no reason to define it in btrfs_inode.h. Drop the inline and add __cold as it's for errors that are not in any hot path. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 26 -------------------------- fs/btrfs/inode.c | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 54c2ccb36b61..530a0ebfab3f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -410,30 +410,4 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes - -static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) -{ - struct btrfs_root *root = inode->root; - const u32 csum_size = root->fs_info->csum_size; - - /* Output minus objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) - btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); - else - btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); -} - #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ba2d8b9cefc..8cae730bd5ba 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,6 +125,32 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static void 
__cold btrfs_print_data_csum_error(struct btrfs_inode *inode, + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) +{ + struct btrfs_root *root = inode->root; + const u32 csum_size = root->fs_info->csum_size; + + /* Output without objectid, which is more meaningful */ + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + btrfs_warn_rl(root->fs_info, +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } else { + btrfs_warn_rl(root->fs_info, +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } +} + /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * -- cgit From 956504a331a613814279b04f9b0d663c7c2bb9bc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:37 -0400 Subject: btrfs: move trans_handle_cachep out of ctree.h This is local to the transaction code, remove it from ctree.h and inode.c, create new helpers in the transaction to handle the init work and move the cachep locally to transaction.c. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/inode.c | 8 -------- fs/btrfs/super.c | 9 ++++++++- fs/btrfs/transaction.c | 17 +++++++++++++++++ fs/btrfs/transaction.h | 3 +++ 5 files changed, 28 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 39974f4b441f..fec3d618bcef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,7 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cae730bd5ba..54f50784aefa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,7 +107,6 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; @@ -8926,7 +8925,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); @@ -8941,12 +8939,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_trans_handle_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5942b9384088..8fe2fdb167a7 100644 --- 
a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2739,10 +2739,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_compress; - err = extent_state_init_cachep(); + err = btrfs_transaction_init(); if (err) goto free_cachep; + err = extent_state_init_cachep(); + if (err) + goto free_transaction; + err = extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2811,6 +2815,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_transaction: + btrfs_transaction_exit(); free_cachep: btrfs_destroy_cachep(); free_compress: @@ -2822,6 +2828,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_transaction_exit(); btrfs_destroy_cachep(); btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1f1da6820fb..ae7d4aca771d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -24,6 +24,8 @@ #include "space-info.h" #include "zoned.h" +static struct kmem_cache *btrfs_trans_handle_cachep; + #define BTRFS_ROOT_TRANS_TAG 0 /* @@ -2600,3 +2602,18 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "unknown pending changes left 0x%lx, ignoring", prev); } + +int __init btrfs_transaction_init(void) +{ + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", + sizeof(struct btrfs_trans_handle), 0, + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_transaction_exit(void) +{ + kmem_cache_destroy(btrfs_trans_handle_cachep); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 970ff316069d..1332f193b800 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -236,4 +236,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +int __init btrfs_transaction_init(void); +void __cold btrfs_transaction_exit(void); + #endif -- cgit From 226463d7b100d30def24f2be492fef0081003a30 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:38 -0400 Subject: btrfs: move btrfs_path_cachep out of ctree.h This is local to the ctree code, remove it from ctree.h and inode.c, create new init/exit functions for the cachep, and move it locally to ctree.c. 
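For reference, every cache localized by this part of the series follows the same init/exit shape; a minimal sketch of the pattern, where "foo" and struct btrfs_foo are placeholders rather than real kernel symbols:

static struct kmem_cache *foo_cachep;

int __init btrfs_foo_init(void)
{
	foo_cachep = kmem_cache_create("btrfs_foo", sizeof(struct btrfs_foo),
				       0, SLAB_MEM_SPREAD, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_foo_exit(void)
{
	/* kmem_cache_destroy() accepts NULL, so this is safe on any path */
	kmem_cache_destroy(foo_cachep);
}

init_btrfs_fs() then calls btrfs_foo_init() and gains a matching unwind label, as the super.c hunks below show.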
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 17 +++++++++++++++++ fs/btrfs/ctree.h | 3 ++- fs/btrfs/inode.c | 8 -------- fs/btrfs/super.c | 9 ++++++++- 4 files changed, 27 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dcb510f38dda..4f2d367e1207 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,8 @@ #include "tree-mod-log.h" #include "tree-checker.h" +static struct kmem_cache *btrfs_path_cachep; + static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4933,3 +4935,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root, } return 1; } + +int __init btrfs_ctree_init(void) +{ + btrfs_path_cachep = kmem_cache_create("btrfs_path", + sizeof(struct btrfs_path), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_ctree_exit(void) +{ + kmem_cache_destroy(btrfs_path_cachep); +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fec3d618bcef..282571b54b6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,7 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; @@ -2662,6 +2661,8 @@ void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); /* ctree.c */ +int __init btrfs_ctree_init(void); +void __cold btrfs_ctree_exit(void); int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54f50784aefa..39c474e8a312 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,7 +107,6 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; @@ -8925,7 +8924,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } @@ -8939,12 +8937,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_path_cachep) - goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, SLAB_MEM_SPREAD, NULL); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8fe2fdb167a7..9f8b77553de0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2743,10 +2743,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_cachep; - err = extent_state_init_cachep(); + err = btrfs_ctree_init(); if (err) goto free_transaction; + err = extent_state_init_cachep(); + if (err) + goto free_ctree; + err = 
extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2815,6 +2819,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_ctree: + btrfs_ctree_exit(); free_transaction: btrfs_transaction_exit(); free_cachep: @@ -2828,6 +2834,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_ctree_exit(); btrfs_transaction_exit(); btrfs_destroy_cachep(); btrfs_delayed_ref_exit(); -- cgit From eda517fd0ceee0329bb956f701c0e124fc1fe269 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:39 -0400 Subject: btrfs: move free space cachep's out of ctree.h This is local to the free-space-cache.c code, remove it from ctree.h and inode.c, create new init/exit functions for the cachep, and move it locally to free-space-cache.c. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/free-space-cache.c | 28 ++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/inode.c | 16 ---------------- fs/btrfs/super.c | 9 ++++++++- 5 files changed, 38 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 282571b54b6d..ca5c67b2c025 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,8 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_free_space_cachep; -extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4023651dd68..bd68fafcccc7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,9 @@ #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M +static struct kmem_cache *btrfs_free_space_cachep; +static struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_trim_range { u64 start; u64 bytes; @@ -4132,6 +4135,31 @@ out: return ret; } +int __init btrfs_free_space_init(void) +{ + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + return -ENOMEM; + + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_bitmap_cachep) { + kmem_cache_destroy(btrfs_free_space_cachep); + return -ENOMEM; + } + + return 0; +} + +void __cold btrfs_free_space_exit(void) +{ + kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); +} + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index eaf30f6444dd..cab954a9d97b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -88,6 +88,8 @@ struct btrfs_io_ctl { int bitmaps; }; +int __init btrfs_free_space_init(void); +void __cold btrfs_free_space_exit(void); struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path); int create_free_space_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 39c474e8a312..50584b93a66f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,8 +107,6 @@ static const struct 
address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_free_space_cachep; -struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); @@ -8924,8 +8922,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_free_space_cachep); - kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -8937,18 +8933,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_cachep) - goto fail; - - btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", - PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_bitmap_cachep) - goto fail; - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_dio_private, bio), BIOSET_NEED_BVECS)) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9f8b77553de0..0a93fbd29494 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2747,10 +2747,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_transaction; - err = extent_state_init_cachep(); + err = btrfs_free_space_init(); if (err) goto free_ctree; + err = extent_state_init_cachep(); + if (err) + goto free_free_space; + err = extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2819,6 +2823,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_free_space: + btrfs_free_space_exit(); free_ctree: btrfs_ctree_exit(); free_transaction: @@ -2834,6 +2840,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_free_space_exit(); btrfs_ctree_exit(); btrfs_transaction_exit(); btrfs_destroy_cachep(); -- cgit From 890d2b1aa38b9101503194855aeee897ba6cdbc2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:40 -0400 Subject: btrfs: move btrfs_next_old_item into ctree.c This uses btrfs_header_nritems, which I will be moving out of ctree.h. In order to avoid needing to include the relevant header in ctree.h, simply move this helper function into ctree.c. 
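For context, a condensed sketch of the typical iteration shape of callers of this helper (a hypothetical caller, error handling trimmed):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	while (1) {
		/* process the item at path->nodes[0], path->slots[0] */

		/* advances the slot, hopping to the next leaf when needed */
		ret = btrfs_next_old_item(root, path, time_seq);
		if (ret)
			break;	/* > 0: no more items, < 0: error */
	}

That leaf-boundary check is the btrfs_header_nritems() call that forces the move out of ctree.h.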
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ rename parameters ] Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++++++++ fs/btrfs/ctree.h | 9 +-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4f2d367e1207..5f461e05cc9c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4852,6 +4852,14 @@ done: return ret; } +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) +{ + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + return btrfs_next_old_leaf(root, path, time_seq); + return 0; +} + /* * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps * searching until it gets past min_objectid or finds an item of 'type' diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ca5c67b2c025..c8ced88f2143 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2821,14 +2821,7 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, (path)->slots[0]++ \ ) -static inline int btrfs_next_old_item(struct btrfs_root *root, - struct btrfs_path *p, u64 time_seq) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_old_leaf(root, p, time_seq); - return 0; -} +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); /* * Search the tree again to find a leaf with greater keys. -- cgit From 7a66eda351bacf2f88801204cfa5319dda3ec75f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:41 -0400 Subject: btrfs: move the btrfs_verity_descriptor_item defs up in ctree.h These are wrapped in CONFIG_FS_VERITY, but we can have the definitions without verity enabled. Move these definitions up with the other accessor helpers. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8ced88f2143..c0c0dc8fa99f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2523,6 +2523,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + /* helper function to cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ @@ -3720,15 +3730,6 @@ extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct btrfs_verity_descriptor_item, encryption, 8); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, - struct btrfs_verity_descriptor_item, size, 64); - #else static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) -- cgit From 765c3fe99bcda005d66c159f9a46fc2e0c77c8ce Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Sep 2022 09:35:01 -0400 Subject: btrfs: introduce BTRFS_RESERVE_FLUSH_EMERGENCY Inside of FB, as well as some user reports, we've had a consistent problem of occasional ENOSPC transaction aborts. Inside FB we were seeing ~100-200 ENOSPC aborts per day in the fleet, which is a really low occurrence rate given the size of our fleet, but it's not nothing. There are two causes of this particular problem. First is delayed allocation. The reservation system for delalloc assumes that contiguous dirty ranges will result in 1 file extent item. However if there is memory pressure that results in fragmented writeout, or there is fragmentation in the block groups, this won't necessarily be true. Consider the case where we do a single 256MiB write to a file and then close it. We will have 1 reservation for the inode update, the reservations for the checksum updates, and 1 reservation for the file extent item. At some point later we decide to write this entire range out, but we're so fragmented that we break this into 100 different file extents. Since we've already closed the file and are no longer writing to it there's nothing to trigger a refill of the delalloc block rsv to satisfy the 99 new file extent reservations we need. At this point we exhaust our delalloc reservation, and we begin to steal from the global reserve. If you have enough of these cases going in parallel you can easily exhaust the global reserve, get an ENOSPC at btrfs_alloc_tree_block() time, and then abort the transaction. The other case is the delayed refs reserve. The delayed refs reserve updates its size based on outstanding delayed refs and dirty block groups. However we only refill this block reserve when returning excess reservations and when we call btrfs_start_transaction(root, X). We will reserve 2*X credits at transaction start time, and fill in X into the delayed refs reserve to make sure it stays topped off. Generally this works well, but clearly has downsides. If we do a particularly delayed ref heavy operation we may never catch up in our reservations. Additionally running delayed refs generates more delayed refs, and at that point we may be committing the transaction and have no way to trigger a refill of our delayed refs rsv. Then a similar thing occurs with the delalloc reserve. Generally speaking we well over-reserve in all of our block rsvs. If we reserve 1 credit we're usually reserving around 264k of space, but we'll often not use any of that reservation, or use a few blocks of that reservation. 
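To put rough numbers on the fragmentation scenario above, here is a minimal userspace sketch of the accounting, using the 1-reservation/100-extents example and the ~264k per-credit figure from this description (all values illustrative):

#include <stdio.h>

int main(void)
{
	const unsigned long bytes_per_credit = 264 * 1024; /* ~264k per credit */
	const int reserved_items = 1;   /* delalloc reserved 1 file extent item */
	const int written_items = 100;  /* fragmented writeout created 100 */
	const int unreserved = written_items - reserved_items;

	printf("file extent items with no reservation: %d\n", unreserved);
	printf("metadata pulled from other reserves: ~%lu KiB\n",
	       unreserved * bytes_per_credit / 1024);
	return 0;
}

With many such files in flight the shortfall lands on the global reserve, which is the abort scenario described here.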
We can be reasonably sure that as long as you were able to reserve space up front for your operation you'll be able to find space on disk for that reservation. So introduce a new flushing state, BTRFS_RESERVE_FLUSH_EMERGENCY. This gets used in the case that we've exhausted our reserve and the global reserve. It simply forces a reservation if we have enough actual space on disk to make the reservation, which is almost always the case. This keeps us from hitting ENOSPC aborts in these odd occurrences where we've not kept up with the delayed work. Fixing this in a complete way is going to be relatively complicated and time consuming. This patch is what I discussed with Filipe earlier this year, and what I put into our kernels inside FB. With this patch we're down to 1-2 ENOSPC aborts per week, which is a significant reduction. This is a decent stop gap until we can work out a more wholistic solution to these two corner cases. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 12 ++++++++++++ fs/btrfs/space-info.c | 29 +++++++++++++++++++++++++++-- fs/btrfs/space-info.h | 18 ++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ec96285357e0..89e3e7d1bff6 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -552,5 +552,17 @@ try_reserve: if (!ret) return global_rsv; } + + /* + * All hope is lost, but of course our reservations are overly + * pessimistic, so instead of possibly having an ENOSPC abort here, try + * one last time to force a reservation if there's enough actual space + * on disk to make the reservation. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + BTRFS_RESERVE_FLUSH_EMERGENCY); + if (!ret) + return block_rsv; + return ERR_PTR(ret); } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f171bf875633..af2e133aaa5c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1583,6 +1583,16 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush) flush == BTRFS_RESERVE_FLUSH_EVICT); } +/* + * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to + * fail as quickly as possible. + */ +static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) +{ + return (flush != BTRFS_RESERVE_NO_FLUSH && + flush != BTRFS_RESERVE_FLUSH_EMERGENCY); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1644,6 +1654,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ret = 0; } + /* + * Things are dire, we need to make a reservation so we don't abort. We + * will let this reservation go through as long as we have actual space + * left to allocate for the block. + */ + if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { + used = btrfs_space_info_used(space_info, false); + if (used + orig_bytes <= + writable_total_bytes(fs_info, space_info)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + } + /* * If we couldn't make a reservation then setup our reservation ticket * and kick the async worker if it's not already running. @@ -1651,7 +1676,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * If we are a priority flusher then we just need to add our ticket to * the list and we will do our own flushing further down. 
*/ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + if (ret && can_ticket(flush)) { ticket.bytes = orig_bytes; ticket.error = 0; space_info->reclaim_size += ticket.bytes; @@ -1701,7 +1726,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + if (!ret || !can_ticket(flush)) return ret; return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 7e17bb803436..f28bd2c051d4 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -48,6 +48,24 @@ enum btrfs_reserve_flush_enum { * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_ALL_STEAL, + + /* + * This is for btrfs_use_block_rsv only. We have exhausted our block + * rsv and our global block rsv. This can happen for things like + * delalloc where we are overwriting a lot of extents with a single + * extent and didn't reserve enough space. Alternatively it can happen + * with delalloc where we reserve 1 extents worth for a large extent but + * fragmentation leads to multiple extents being created. This will + * give us the reservation in the case of + * + * if (num_bytes < (space_info->total_bytes - + * btrfs_space_info_used(space_info, false)) + * + * Which ignores bytes_may_use. This is potentially dangerous, but our + * reservation system is generally pessimistic so is able to absorb this + * style of mistake. + */ + BTRFS_RESERVE_FLUSH_EMERGENCY, }; enum btrfs_flush_state { -- cgit From ff2b64a22a2efcc087520e94ad06b005268a5f9d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:08 +0800 Subject: btrfs: raid56: cleanup for function __free_raid_bio() The cleanup involves two things: - Remove the "__" prefix There is no naming conflict. - Remove the forward declaration There is no special function call involved.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 61 +++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 82c8e991300e..371b2a182544 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -69,7 +69,6 @@ static void rmw_work(struct work_struct *work); static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); -static void __free_raid_bio(struct btrfs_raid_bio *rbio); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); @@ -77,6 +76,28 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); static void scrub_parity_work(struct work_struct *work); +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + if (!refcount_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bioc(rbio->bioc); + kfree(rbio); +} + static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { INIT_WORK(&rbio->work, work_func); @@ -336,7 +357,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) spin_unlock(&h->lock); if (freeit) - __free_raid_bio(rbio); + free_raid_bio(rbio); } /* @@ -684,7 +705,7 @@ out: if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) - __free_raid_bio(freeit); + free_raid_bio(freeit); return ret; } @@ -769,28 +790,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void __free_raid_bio(struct btrfs_raid_bio *rbio) -{ - int i; - - if (!refcount_dec_and_test(&rbio->refs)) - return; - - WARN_ON(!list_empty(&rbio->stripe_cache)); - WARN_ON(!list_empty(&rbio->hash_list)); - WARN_ON(!bio_list_empty(&rbio->bio_list)); - - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) { - __free_page(rbio->stripe_pages[i]); - rbio->stripe_pages[i] = NULL; - } - } - - btrfs_put_bioc(rbio->bioc); - kfree(rbio); -} - static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { struct bio *next; @@ -830,7 +829,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); - __free_raid_bio(rbio); + free_raid_bio(rbio); rbio_endio_bio_list(cur, err); if (extra) @@ -1731,7 +1730,7 @@ static void run_plug(struct btrfs_plug_cb *plug) if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); - __free_raid_bio(cur); + free_raid_bio(cur); continue; } @@ -1822,7 +1821,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) { - __free_raid_bio(rbio); + free_raid_bio(rbio); goto fail; } return; @@ -1839,7 +1838,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) } else { ret = __raid56_parity_write(rbio); if (ret) { - __free_raid_bio(rbio); + free_raid_bio(rbio); goto fail; } } @@ -2214,7 +2213,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has 
logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - __free_raid_bio(rbio); + free_raid_bio(rbio); bio->bi_status = BLK_STS_IOERR; goto out_end_bio; } @@ -2747,7 +2746,7 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) btrfs_warn_rl(fs_info, "can not determine the failed stripe number for full stripe %llu", bioc->raid_map[0]); - __free_raid_bio(rbio); + free_raid_bio(rbio); return NULL; } -- cgit From 797d74b749850a5b81c47560caec26e00e7e3768 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:09 +0800 Subject: btrfs: raid56: allocate memory separately for rbio pointers Currently inside alloc_rbio(), we allocate one larger chunk of memory to hold the following members: - struct btrfs_raid_bio itself - stripe_pages array - bio_sectors array - stripe_sectors array - finish_pointers array Then we update the rbio pointers to point into the extra space after the rbio structure itself. This required a complex CONSUME_ALLOC() macro to carve up that allocation. This is too hacky, and it makes later pointer expansion harder. This patch will change it to use a regular kcalloc() for each pointer inside btrfs_raid_bio, making later expansion much easier. And introduce a helper free_raid_bio_pointers() to free up all the pointer members in btrfs_raid_bio, which will be used in both free_raid_bio() and the error path of alloc_rbio(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 371b2a182544..4ec211a58f15 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -76,6 +76,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); static void scrub_parity_work(struct work_struct *work); +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + kfree(rbio->stripe_pages); + kfree(rbio->bio_sectors); + kfree(rbio->stripe_sectors); + kfree(rbio->finish_pointers); +} + static void free_raid_bio(struct btrfs_raid_bio *rbio) { int i; @@ -95,6 +103,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) } btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); kfree(rbio); } @@ -918,7 +927,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - void *p; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); @@ -928,14 +936,23 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); - rbio = kzalloc(sizeof(*rbio) + - sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_sectors) * num_sectors + - sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes, - GFP_NOFS); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); + rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), + GFP_NOFS); + rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + + if (!rbio->stripe_pages || !rbio->bio_sectors ||
!rbio->stripe_sectors || + !rbio->finish_pointers) { + free_raid_bio_pointers(rbio); + kfree(rbio); + return ERR_PTR(-ENOMEM); + } bio_list_init(&rbio->bio_list); INIT_LIST_HEAD(&rbio->plug_list); @@ -955,21 +972,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); - /* - * The stripe_pages, bio_sectors, etc arrays point to the extra memory - * we allocated past the end of the rbio. - */ - p = rbio + 1; -#define CONSUME_ALLOC(ptr, count) do { \ - ptr = p; \ - p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ - } while (0) - CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_sectors, num_sectors); - CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); - CONSUME_ALLOC(rbio->finish_pointers, real_stripes); -#undef CONSUME_ALLOC - ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); -- cgit From 88074c8b1376ac315ef4a294db82d861df074ef2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:10 +0800 Subject: btrfs: raid56: make it more explicit that cache rbio should have all its data sectors uptodate For Btrfs RAID56, we have a caching system for btrfs raid bios (rbio). We call cache_rbio_pages() to mark a qualified rbio ready for caching. This happens at: - finish_rmw() At this point we have already read all necessary sectors, which, along with the rbio sectors, cover all data stripes. - __raid_recover_end_io() At this point we have rebuilt the rbio, thus all data sectors involved (either from the stripe or bio list) are now uptodate. Thus by the time cache_rbio_pages() is called, all data sectors should be uptodate. This patch will make it explicit that all data sectors are uptodate at cache_rbio_pages() time, mostly to prepare for the upcoming verification at RMW time. This patch will add: - Extra ASSERT()s in cache_rbio_pages() This makes sure all data sectors that are not covered by a bio are already uptodate. - Extra ASSERT()s in steal_rbio() Since only a cached rbio can be stolen, every data sector should already be uptodate in the source rbio. - Update __raid_recover_end_io() to update recovered sector->uptodate Previously __raid_recover_end_io() would only mark failed sectors uptodate if it was doing an RMW. But this can trigger the new ASSERT()s, as in the recovery case a recovered failed sector would not be marked uptodate and would trigger an ASSERT() in a later cache_rbio_pages() call. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 70 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4ec211a58f15..c009c0a2081e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -176,8 +176,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) + if (!rbio->bio_sectors[i].page) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk.
+ */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(rbio->stripe_sectors[i].uptodate); continue; + } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, @@ -264,6 +272,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, dest->stripe_sectors[i].uptodate = true; } +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = (page_nr << PAGE_SHIFT) >> + rbio->bioc->fs_info->sectorsize_bits; + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -274,16 +297,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; - struct page *s; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { - s = src->stripe_pages[i]; - if (!s || !full_page_sectors_uptodate(src, i)) + struct page *p = src->stripe_pages[i]; + + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) continue; + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); @@ -2003,22 +2036,21 @@ pstripe: /* xor in the rest */ run_xor(pointers, rbio->nr_data - 1, sectorsize); } - /* if we're doing this rebuild as part of an rmw, go through - * and set all of our private rbio pages in the - * failed stripes as uptodate. This way finish_rmw will - * know they can be trusted. If this was a read reconstruction, - * other endio functions will fiddle the uptodate bits + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired, thus they are now uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. */ - if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_nsectors; i++) { - if (faila != -1) { - sector = rbio_stripe_sector(rbio, faila, i); - sector->uptodate = 1; - } - if (failb != -1) { - sector = rbio_stripe_sector(rbio, failb, i); - sector->uptodate = 1; - } + for (i = 0; i < rbio->stripe_nsectors; i++) { + if (faila != -1) { + sector = rbio_stripe_sector(rbio, faila, i); + sector->uptodate = 1; + } + if (failb != -1) { + sector = rbio_stripe_sector(rbio, failb, i); + sector->uptodate = 1; } } for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) -- cgit From d47704bd1c78c85831561bcf701b90dd66f811b2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:54 +0100 Subject: btrfs: get the next extent map during fiemap/lseek more efficiently At find_delalloc_subrange(), when we need to get the next extent map, we do a full search on the extent map tree (a red black tree). This is fine but it's a lot more efficient to simply use rb_next(), which typically requires iterating over less nodes of the tree and never needs to compare the ranges of nodes with the one we are looking for. 
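Concretely, with the kernel rbtree API the successor is reached by a short pointer walk instead of a new top-down search with key comparisons at every level (a generic sketch, not the helper added below):

	struct rb_node *node = rb_next(&em->rb_node);
	struct extent_map *next_em = NULL;

	/* in-order successor; no range comparisons involved */
	if (node)
		next_em = rb_entry(node, struct extent_map, rb_node);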
So add a public helper to extent_map.{h,c} to get the extent map that immediately follows another extent map, using rb_next(), and use that helper at find_delalloc_subrange(). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/extent_map.h | 2 ++ fs/btrfs/file.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 59 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6092a4eedc92..715979807ae1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -523,7 +523,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, setup_extent_mapping(tree, new, modified); } -static struct extent_map *next_extent_map(struct extent_map *em) +static struct extent_map *next_extent_map(const struct extent_map *em) { struct rb_node *next; @@ -533,6 +533,35 @@ static struct extent_map *next_extent_map(struct extent_map *em) return container_of(next, struct extent_map, rb_node); } +/* + * Get the extent map that immediately follows another one. + * + * @tree: The extent map tree that the extent map belong to. + * Holding read or write access on the tree's lock is required. + * @em: An extent map from the given tree. The caller must ensure that + * between getting @em and between calling this function, the + * extent map @em is not removed from the tree - for example, by + * holding the tree's lock for the duration of those 2 operations. + * + * Returns the extent map that immediately follows @em, or NULL if @em is the + * last extent map in the tree. + */ +struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, + const struct extent_map *em) +{ + struct extent_map *next; + + /* The lock must be acquired either in read mode or write mode. */ + lockdep_assert_held(&tree->lock); + ASSERT(extent_map_in_tree(em)); + + next = next_extent_map(em); + if (next) + refcount_inc(&next->refs); + + return next; +} + static struct extent_map *prev_extent_map(struct extent_map *em) { struct rb_node *prev; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ad311864272a..68d3f2c9ea1d 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -87,6 +87,8 @@ static inline u64 extent_map_block_end(struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, + const struct extent_map *em); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 493cae66e5e6..6685cb0e928c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3569,40 +3569,50 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); + if (!em) { + read_unlock(&em_tree->lock); + return (delalloc_len > 0); + } /* extent_map_end() returns a non-inclusive end offset. */ - em_end = em ? extent_map_end(em) : 0; + em_end = extent_map_end(em); /* * If we have a hole/prealloc extent map, check the next one if this one * ends before our range's end. 
*/ - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { + if ((em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { struct extent_map *next_em; - read_lock(&em_tree->lock); - next_em = lookup_extent_mapping(em_tree, em_end, len - em_end); - read_unlock(&em_tree->lock); - + next_em = btrfs_next_extent_map(em_tree, em); free_extent_map(em); - em_end = next_em ? extent_map_end(next_em) : 0; + + /* + * There's no next extent map or the next one starts beyond our + * range, return the range found in the io tree (if any). + */ + if (!next_em || next_em->start > end) { + read_unlock(&em_tree->lock); + free_extent_map(next_em); + return (delalloc_len > 0); + } + + em_end = extent_map_end(next_em); em = next_em; } - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - free_extent_map(em); - em = NULL; - } + read_unlock(&em_tree->lock); /* - * No extent map or one for a hole or prealloc extent. Use the delalloc - * range we found in the io tree if we have one. + * We have a hole or prealloc extent that ends at or beyond our range's + * end, return the range found in the io tree (if any). */ - if (!em) + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); return (delalloc_len > 0); + } /* * We don't have any range as EXTENT_DELALLOC in the io tree, so the -- cgit From 013f9c70d293d7a47aaaeaee248ca60f745fa450 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:55 +0100 Subject: btrfs: skip unnecessary extent map searches during fiemap and lseek If we have no outstanding extents it means we don't have any extent maps corresponding to delalloc that is flushing, as when an ordered extent is created we increment the number of outstanding extents to 1 and when we remove the ordered extent we decrement them by 1. So skip extent map tree searches if the number of outstanding ordered extents is 0, saving time as the tree is not empty if we have previously made some reads or flushed delalloc, as in those cases it can have a very large number of extent maps for files with many extents. This helps save time when processing a file range corresponding to a hole or prealloc (unwritten) extent. The next patch in the series has a performance test in its changelog and its subject is: "btrfs: skip unnecessary delalloc search during fiemap and lseek" Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6685cb0e928c..1cf97280760e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3553,6 +3553,18 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (delalloc_len > 0) *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + spin_lock(&inode->lock); + if (inode->outstanding_extents == 0) { + /* + * No outstanding extents means we don't have any delalloc that + * is flushing, so return the unflushed range found in the io + * tree (if any). + */ + spin_unlock(&inode->lock); + return (delalloc_len > 0); + } + spin_unlock(&inode->lock); + /* * Now also check if there's any extent map in the range that does not * map to a hole or prealloc extent. 
We do this because: -- cgit From a2853ffc2eb9bfb6e6d48486df7f3969a58ae3b4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:56 +0100 Subject: btrfs: skip unnecessary delalloc search during fiemap and lseek During fiemap and lseek (hole and data seeking), there's no point in iterating the inode's io tree to count delalloc bits if the inode's delalloc bytes counter has a value of zero, as that counter is updated whenever we set a range for delalloc or clear a range from delalloc. So skip the counting and io tree iteration if the inode's delalloc bytes counter has a value of zero. This helps save time when processing a file range corresponding to a hole or prealloc (unwritten) extent. This patch is part of a series comprised of the following patches: btrfs: get the next extent map during fiemap/lseek more efficiently btrfs: skip unnecessary extent map searches during fiemap and lseek btrfs: skip unnecessary delalloc search during fiemap and lseek The following test was performed on a release kernel (Debian's default kernel config) before and after applying those 3 patches. # Wrapper to call fiemap in extent count only mode. # (struct fiemap::fm_extent_count set to 0) $ cat fiemap.c #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <fcntl.h> #include <unistd.h> #include <sys/ioctl.h> #include <linux/fs.h> #include <linux/fiemap.h> int main(int argc, char **argv) { struct fiemap fiemap = { 0 }; int fd; if (argc != 2) { printf("usage: %s <file>\n", argv[0]); return 1; } fd = open(argv[1], O_RDONLY); if (fd < 0) { fprintf(stderr, "error opening file: %s\n", strerror(errno)); return 1; } /* fiemap.fm_extent_count set to 0, to count extents only. */ fiemap.fm_length = FIEMAP_MAX_OFFSET; if (ioctl(fd, FS_IOC_FIEMAP, &fiemap) < 0) { fprintf(stderr, "fiemap error: %s\n", strerror(errno)); return 1; } close(fd); printf("fm_mapped_extents = %d\n", fiemap.fm_mapped_extents); return 0; } $ gcc -o fiemap fiemap.c And the wrapper shell script that creates a file with many holes and runs fiemap against it: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1 * 1024 * 1024 * 1024)) echo -n > $MNT/foobar for ((off = 0; off < $FILE_SIZE; off += 8192)); do xfs_io -c "pwrite -S 0xab $off 4K" $MNT/foobar > /dev/null done # flush all delalloc sync start=$(date +%s%N) ./fiemap $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds" umount $MNT Result before applying patchset: fm_mapped_extents = 131072 fiemap took 63 milliseconds Result after applying patchset: fm_mapped_extents = 131072 fiemap took 39 milliseconds (-38.1%) Running the same test for a 512M file instead of a 1G file gave the following results. Result before applying patchset: fm_mapped_extents = 65536 fiemap took 29 milliseconds Result after applying patchset: fm_mapped_extents = 65536 fiemap took 20 milliseconds (-31.0%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1cf97280760e..19b41b5fe6c0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3537,15 +3537,27 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end struct extent_map *em; u64 em_end; u64 delalloc_len; + unsigned int outstanding_extents; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet.
*/ - *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); + spin_lock(&inode->lock); + outstanding_extents = inode->outstanding_extents; + + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + } else { + spin_unlock(&inode->lock); + delalloc_len = 0; + } + /* * If delalloc was found then *delalloc_start_ret has a sector size * aligned value (rounded down). @@ -3553,17 +3565,12 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (delalloc_len > 0) *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; - spin_lock(&inode->lock); - if (inode->outstanding_extents == 0) { - /* - * No outstanding extents means we don't have any delalloc that - * is flushing, so return the unflushed range found in the io - * tree (if any). - */ - spin_unlock(&inode->lock); + /* + * No outstanding extents means we don't have any delalloc that is + * flushing, so return the unflushed range found in the io tree (if any). + */ + if (outstanding_extents == 0) return (delalloc_len > 0); - } - spin_unlock(&inode->lock); /* * Now also check if there's any extent map in the range that does not -- cgit From b98c6cd59e90fe659cf966859bc4e1c03aea347b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:57 +0100 Subject: btrfs: drop pointless memset when cloning extent buffer At btrfs_clone_extent_buffer(), before allocating the pages array for the new extent buffer we are calling memset() to zero out the pages array of the extent buffer. This is pointless however, because the extent buffer already has every element in its pages array pointing to NULL, as it was allocated with kmem_cache_zalloc(). The memset() was introduced with commit dd137dd1f2d719 ("btrfs: factor out allocating an array of pages"), but even before that commit we already depended on the pages array being initialized to NULL for the error paths that need to call btrfs_release_extent_buffer(). So remove the memset(), it's useless and slightly increases the object text size. Before this change: $ size fs/btrfs/extent_io.o text data bss dec hex filename 70580 5469 40 76089 12939 fs/btrfs/extent_io.o After this change: $ size fs/btrfs/extent_io.o text data bss dec hex filename 70564 5469 40 76073 12929 fs/btrfs/extent_io.o Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7891d375eb43..56a9f591f479 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4299,7 +4299,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - memset(new->pages, 0, sizeof(*new->pages) * num_pages); ret = btrfs_alloc_page_array(num_pages, new->pages); if (ret) { btrfs_release_extent_buffer(new); -- cgit From 206c1d32f381ee91ba849a7dcb28728e8c3721b6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:58 +0100 Subject: btrfs: drop redundant bflags initialization when allocating extent buffer When allocating an extent buffer, at __alloc_extent_buffer(), there's no point in explicitly assigning zero to the bflags field of the new extent buffer because we allocated it with kmem_cache_zalloc(). 
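In other words (one-line sketch): eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS) hands back memory that is already all zeroes, bflags included.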
So just remove the redundant initialization; it saves one mov instruction in the generated assembly code for x86_64 ("movq $0x0,0x10(%rax)"). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 56a9f591f479..3192f094fe6c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4266,7 +4266,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->start = start; eb->len = len; eb->fs_info = fs_info; - eb->bflags = 0; init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); -- cgit From c902421927ff602954d2ecc7bd6e71b255d29387 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:59 +0100 Subject: btrfs: remove checks for a root with id 0 during backref walking When doing backref walking to determine if an extent is shared, we are testing whether the root_objectid of the given share_check struct is 0, but that is an impossible case, since btrfs_is_data_extent_shared() always initializes the root_objectid field with the id of the given root, and no root can have an objectid of 0. So remove those checks. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 18374a6d05bd..abac2e942e8b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -719,8 +719,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; } - if (sc && sc->root_objectid && - ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root_objectid) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -1348,8 +1347,7 @@ again: * and would retain their original ref->count < 0. */ if (roots && ref->count && ref->root_id && ref->parent == 0) { - if (sc && sc->root_objectid && - ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root_objectid) { ret = BACKREF_FOUND_SHARED; goto out; } -- cgit From a0a5472ad802d99d3fb4b361cc3fb5ea24914ee0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:00 +0100 Subject: btrfs: remove checks for a 0 inode number during backref walking When doing backref walking to determine if an extent is shared, we are testing if the inode number, stored in the 'inum' field of struct share_check, is 0. However that can never be the case, since all instances of the structure are created at btrfs_is_data_extent_shared(), which always initializes it with the inode number from a fs tree (and the number for any inode from any tree can never be 0). So remove the checks.
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index abac2e942e8b..9c7b3e1e4762 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1051,7 +1051,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; @@ -1152,7 +1152,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; -- cgit From ceb707da9ad92ad3a5251dc13844034ded06cb3d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:01 +0100 Subject: btrfs: directly pass the inode to btrfs_is_data_extent_shared() Currently we pass a root and an inode number as arguments for btrfs_is_data_extent_shared() and the inode number is always from an inode that belongs to that root (it wouldn't make sense otherwise). In every context that we call btrfs_is_data_extent_shared() (fiemap only), we have an inode available, so directly pass the inode to the function instead of a root and inode number. This reduces the number of parameters and it makes the function's signature conform to most other functions we have. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 8 ++++---- fs/btrfs/backref.h | 2 +- fs/btrfs/extent_io.c | 16 +++++++--------- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9c7b3e1e4762..efdeb69e8ed6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1658,8 +1658,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, /* * Check if a data extent is shared or not. * - * @root: The root the inode belongs to. - * @inum: Number of the inode whose extent we are checking. + * @inode: The inode whose extent we are checking. * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. @@ -1678,11 +1677,12 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. 
*/ -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, struct btrfs_backref_shared_cache *cache) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct ulist_iterator uiter; @@ -1691,7 +1691,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, int ret = 0; struct share_check shared = { .root_objectid = root->root_key.objectid, - .inum = inum, + .inum = btrfs_ino(inode), .share_count = 0, .have_delayed_delete_refs = false, }; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8e69584d538d..c846fa2bdf93 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -77,7 +77,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, struct btrfs_backref_shared_cache *cache); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3192f094fe6c..83be7e85ba91 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3712,7 +3712,6 @@ static int fiemap_process_hole(struct btrfs_inode *inode, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); - const u64 ino = btrfs_ino(inode); u64 cur_offset = start; u64 last_delalloc_end = 0; u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; @@ -3752,8 +3751,8 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, extent_gen, roots, tmp_ulist, backref_cache); @@ -3802,8 +3801,8 @@ static int fiemap_process_hole(struct btrfs_inode *inode, } if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, extent_gen, roots, tmp_ulist, backref_cache); @@ -3904,7 +3903,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct btrfs_backref_shared_cache *backref_cache; struct ulist *roots; @@ -3925,8 +3923,8 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - lockstart = round_down(start, root->fs_info->sectorsize); - lockend = round_up(start + len, root->fs_info->sectorsize); + lockstart = round_down(start, inode->root->fs_info->sectorsize); + lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); @@ -4034,7 +4032,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } else { /* We have a regular extent. 
*/ if (fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(root, ino, + ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, roots, -- cgit From 61dbb952f0a5f587c983d88853212e969d2d4ede Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:02 +0100 Subject: btrfs: turn the backref sharedness check cache into a context object Right now we are using a struct btrfs_backref_shared_cache to pass state across multiple btrfs_is_data_extent_shared() calls. The structure's name closely follows its current purpose, which is to cache previous checks for the sharedness of metadata extents. However we will start using the structure for more things other than caching sharedness checks, so rename it to struct btrfs_backref_share_check_ctx. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 32 ++++++++++++++++---------------- fs/btrfs/backref.h | 8 ++++---- fs/btrfs/extent_io.c | 24 ++++++++++++------------ 3 files changed, 32 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index efdeb69e8ed6..693c99ad4afb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1542,13 +1542,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. */ -static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache, +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { struct btrfs_backref_shared_cache_entry *entry; - if (!cache->use_cache) + if (!ctx->use_path_cache) return false; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) @@ -1562,7 +1562,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache */ ASSERT(level >= 0); - entry = &cache->entries[level]; + entry = &ctx->path_cache_entries[level]; /* Unused cache entry or being used for some other extent buffer. */ if (entry->bytenr != bytenr) @@ -1595,8 +1595,8 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache */ if (*is_shared) { for (int i = 0; i < level; i++) { - cache->entries[i].is_shared = true; - cache->entries[i].gen = entry->gen; + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; } } @@ -1608,14 +1608,14 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. 
*/ -static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { struct btrfs_backref_shared_cache_entry *entry; u64 gen; - if (!cache->use_cache) + if (!ctx->use_path_cache) return; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) @@ -1634,7 +1634,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, else gen = btrfs_root_last_snapshot(&root->root_item); - entry = &cache->entries[level]; + entry = &ctx->path_cache_entries[level]; entry->bytenr = bytenr; entry->is_shared = is_shared; entry->gen = gen; @@ -1648,7 +1648,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, */ if (is_shared) { for (int i = 0; i < level; i++) { - entry = &cache->entries[i]; + entry = &ctx->path_cache_entries[i]; entry->is_shared = is_shared; entry->gen = gen; } @@ -1664,7 +1664,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, * not known. * @roots: List of roots this extent is shared among. * @tmp: Temporary list used for iteration. - * @cache: A backref lookup result cache. + * @ctx: A backref sharedness check context. * * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the @@ -1680,7 +1680,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache) + struct btrfs_backref_share_check_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1715,7 +1715,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - cache->use_cache = true; + ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1726,7 +1726,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* this is the only condition under which we return 1 */ ret = 1; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, + store_backref_shared_cache(ctx, root, bytenr, level, true); break; } @@ -1761,17 +1761,17 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, * (which implies multiple paths). */ if (level == -1 && tmp->nnodes > 1) - cache->use_cache = false; + ctx->use_path_cache = false; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, + store_backref_shared_cache(ctx, root, bytenr, level, false); node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; level++; - cached = lookup_backref_shared_cache(cache, root, bytenr, level, + cached = lookup_backref_shared_cache(ctx, root, bytenr, level, &is_shared); if (cached) { ret = (is_shared ? 1 : 0); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c846fa2bdf93..e3a2b45a76e3 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -23,13 +23,13 @@ struct btrfs_backref_shared_cache_entry { bool is_shared; }; -struct btrfs_backref_shared_cache { +struct btrfs_backref_share_check_ctx { /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
*/ - struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; - bool use_cache; + struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL]; + bool use_path_cache; }; typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, @@ -80,7 +80,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache); + struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 83be7e85ba91..365ad00a5942 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3705,7 +3705,7 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, - struct btrfs_backref_shared_cache *backref_cache, + struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, struct ulist *roots, struct ulist *tmp_ulist, @@ -3755,7 +3755,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, disk_bytenr, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3805,7 +3805,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, disk_bytenr, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3904,7 +3904,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; - struct btrfs_backref_shared_cache *backref_cache; + struct btrfs_backref_share_check_ctx *backref_ctx; struct ulist *roots; struct ulist *tmp_ulist; u64 last_extent_end; @@ -3914,11 +3914,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; - backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + backref_ctx = kzalloc(sizeof(*backref_ctx), GFP_KERNEL); path = btrfs_alloc_path(); roots = ulist_alloc(GFP_KERNEL); tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!backref_cache || !path || !roots || !tmp_ulist) { + if (!backref_ctx || !path || !roots || !tmp_ulist) { ret = -ENOMEM; goto out; } @@ -3978,7 +3978,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 range_end = min(key.offset, lockend) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, + backref_ctx, 0, 0, 0, roots, tmp_ulist, prev_extent_end, range_end); if (ret < 0) { @@ -4019,14 +4019,14 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, + backref_ctx, disk_bytenr, extent_offset, extent_gen, roots, tmp_ulist, key.offset, extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. 
*/ ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, + backref_ctx, 0, 0, 0, roots, tmp_ulist, key.offset, extent_end - 1); } else { @@ -4037,7 +4037,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) goto out_unlock; else if (ret > 0) @@ -4086,7 +4086,7 @@ check_eof_delalloc: path = NULL; if (!stopped && prev_extent_end < lockend) { - ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, + ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, roots, tmp_ulist, prev_extent_end, lockend - 1); if (ret < 0) @@ -4119,7 +4119,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); out: - kfree(backref_cache); + kfree(backref_ctx); btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); -- cgit From 84a7949d409753c90dc3477b8cfc18e983b09078 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:03 +0100 Subject: btrfs: move ulists to data extent sharedness check context When calling btrfs_is_data_extent_shared() we pass two ulists that were allocated by the caller. This is because the single caller, fiemap, calls btrfs_is_data_extent_shared() multiple times and the ulists can be reused, instead of allocating new ones before each call and freeing them after each call. Now that we have a context structure/object that we pass to btrfs_is_data_extent_shared(), we can move those ulists to it, and hide their allocation and the context's allocation in a helper function, as well as the freeing of the ulists and the context object. This allows to reduce the number of parameters passed to btrfs_is_data_extent_shared(), the need to pass the ulists from extent_fiemap() to fiemap_process_hole() and having the caller deal with allocating and releasing the ulists. Also rename one of the ulists from 'tmp' / 'tmp_ulist' to 'refs', since that's a much better name as it reflects what the list is used for (and matching the argument name for find_parent_nodes()). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 43 ++++++++++++++++++++++++++++++++----------- fs/btrfs/backref.h | 7 ++++++- fs/btrfs/extent_io.c | 34 ++++++++++------------------------ 3 files changed, 48 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 693c99ad4afb..4caff3052da7 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1655,6 +1655,30 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx } } +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) +{ + struct btrfs_backref_share_check_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ulist_init(&ctx->refs); + ulist_init(&ctx->roots); + + return ctx; +} + +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx) +{ + if (!ctx) + return; + + ulist_release(&ctx->refs); + ulist_release(&ctx->roots); + kfree(ctx); +} + /* * Check if a data extent is shared or not. * @@ -1662,8 +1686,6 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. - * @roots: List of roots this extent is shared among. - * @tmp: Temporary list used for iteration. * @ctx: A backref sharedness check context. 
* * btrfs_is_data_extent_shared uses the backref walking code but will short @@ -1679,7 +1701,6 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx */ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, struct btrfs_backref_share_check_ctx *ctx) { struct btrfs_root *root = inode->root; @@ -1697,8 +1718,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, }; int level; - ulist_init(roots); - ulist_init(tmp); + ulist_init(&ctx->roots); + ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { @@ -1720,8 +1741,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool is_shared; bool cached; - ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared, false); + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, + &ctx->roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1760,13 +1781,13 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, * deal with), we can not use it if we have multiple leaves * (which implies multiple paths). */ - if (level == -1 && tmp->nnodes > 1) + if (level == -1 && ctx->refs.nnodes > 1) ctx->use_path_cache = false; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, level, false); - node = ulist_next(tmp, &uiter); + node = ulist_next(&ctx->refs, &uiter); if (!node) break; bytenr = node->val; @@ -1789,8 +1810,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, up_read(&fs_info->commit_root_sem); } out: - ulist_release(roots); - ulist_release(tmp); + ulist_release(&ctx->roots); + ulist_release(&ctx->refs); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e3a2b45a76e3..8da0ba6b94a4 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -24,6 +24,9 @@ struct btrfs_backref_shared_cache_entry { }; struct btrfs_backref_share_check_ctx { + /* Ulists used during backref walking. */ + struct ulist refs; + struct ulist roots; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
@@ -35,6 +38,9 @@ struct btrfs_backref_share_check_ctx { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); + int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); @@ -79,7 +85,6 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 *found_off); int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 365ad00a5942..e25e54d2216f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3708,7 +3708,6 @@ static int fiemap_process_hole(struct btrfs_inode *inode, struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, - struct ulist *roots, struct ulist *tmp_ulist, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); @@ -3752,10 +3751,9 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, roots, - tmp_ulist, - backref_ctx); + disk_bytenr, + extent_gen, + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3803,8 +3801,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, disk_bytenr, - extent_gen, roots, - tmp_ulist, + extent_gen, backref_ctx); if (ret < 0) return ret; @@ -3905,8 +3902,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; - struct ulist *roots; - struct ulist *tmp_ulist; u64 last_extent_end; u64 prev_extent_end; u64 lockstart; @@ -3914,11 +3909,9 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; - backref_ctx = kzalloc(sizeof(*backref_ctx), GFP_KERNEL); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!backref_ctx || !path || !roots || !tmp_ulist) { + if (!backref_ctx || !path) { ret = -ENOMEM; goto out; } @@ -3979,7 +3972,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, - roots, tmp_ulist, prev_extent_end, range_end); if (ret < 0) { goto out_unlock; @@ -4021,13 +4013,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, disk_bytenr, extent_offset, - extent_gen, roots, tmp_ulist, - key.offset, extent_end - 1); + extent_gen, key.offset, + extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. */ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, - roots, tmp_ulist, key.offset, extent_end - 1); } else { /* We have a regular extent. 
 */
@@ -4035,8 +4026,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 				ret = btrfs_is_data_extent_shared(inode,
 								  disk_bytenr,
 								  extent_gen,
-								  roots,
-								  tmp_ulist,
 								  backref_ctx);
 				if (ret < 0)
 					goto out_unlock;
 				else if (ret > 0)
@@ -4087,8 +4076,7 @@ check_eof_delalloc:
 
 	if (!stopped && prev_extent_end < lockend) {
 		ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx,
-					  0, 0, 0, roots, tmp_ulist,
-					  prev_extent_end, lockend - 1);
+					  0, 0, 0, prev_extent_end, lockend - 1);
 		if (ret < 0)
 			goto out_unlock;
 		prev_extent_end = lockend;
@@ -4119,10 +4107,8 @@ check_eof_delalloc:
 out_unlock:
 	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 out:
-	kfree(backref_ctx);
+	btrfs_free_backref_share_ctx(backref_ctx);
 	btrfs_free_path(path);
-	ulist_free(roots);
-	ulist_free(tmp_ulist);
 	return ret;
 }
-- 
cgit

From b629685803bc0cefbbd29240ea28d57d8a17bcbc Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:04 +0100
Subject: btrfs: remove roots ulist when checking data extent sharedness

Currently btrfs_is_data_extent_shared() is passing a ulist for the roots
argument of find_parent_nodes(), however it does not use that ulist for
anything and for this context that list always ends up with at most one
element.

Since find_parent_nodes() is able to deal with a NULL ulist for its roots
argument, make btrfs_is_data_extent_shared() pass it NULL and avoid the
burden of allocating memory for the unused roots ulist, initializing it,
releasing it and allocating one struct ulist_node for it during the call
to find_parent_nodes().

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 6 +-----
 fs/btrfs/backref.h | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4caff3052da7..4d8573fa75b4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1664,7 +1664,6 @@ struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void)
 		return NULL;
 
 	ulist_init(&ctx->refs);
-	ulist_init(&ctx->roots);
 
 	return ctx;
 }
@@ -1675,7 +1674,6 @@ void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx)
 		return;
 
 	ulist_release(&ctx->refs);
-	ulist_release(&ctx->roots);
 	kfree(ctx);
 }
 
@@ -1718,7 +1716,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 	};
 	int level;
 
-	ulist_init(&ctx->roots);
 	ulist_init(&ctx->refs);
 
 	trans = btrfs_join_transaction_nostart(root);
@@ -1742,7 +1739,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		bool cached;
 
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs,
-					&ctx->roots, NULL, &shared, false);
+					NULL, NULL, &shared, false);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;
@@ -1810,7 +1807,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		up_read(&fs_info->commit_root_sem);
 	}
 out:
-	ulist_release(&ctx->roots);
 	ulist_release(&ctx->refs);
 	return ret;
 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 8da0ba6b94a4..5f468f0defda 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -26,7 +26,6 @@ struct btrfs_backref_shared_cache_entry {
 struct btrfs_backref_share_check_ctx {
 	/* Ulists used during backref walking. */
 	struct ulist refs;
-	struct ulist roots;
 	/*
 	 * A path from a root to a leaf that has a file extent item pointing to
 	 * a given data extent should never exceed the maximum b+tree height.
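As an illustration of the pattern the commit above relies on (an optional
output list that the callee simply skips when it is NULL), the following is
a minimal userspace sketch; the names are hypothetical and this is not
btrfs's ulist API:

    #include <stddef.h>

    /* Hypothetical fixed-capacity id list, standing in for struct ulist. */
    struct id_list {
        unsigned long long ids[8];
        int count;
    };

    /*
     * Walks a set of references; fills 'roots' only when the caller passed
     * a list. Callers that only care about the return value pass NULL, so
     * no list memory has to be allocated, initialized or released at all.
     */
    static int collect_root_ids(const unsigned long long *refs, int nr_refs,
                                struct id_list *roots)
    {
        for (int i = 0; i < nr_refs; i++) {
            if (roots && roots->count < 8)
                roots->ids[roots->count++] = refs[i];
        }
        return (nr_refs > 1) ? 1 : 0; /* e.g. 1 means "shared" */
    }

    int main(void)
    {
        const unsigned long long refs[] = { 5, 257 };

        /* Equivalent of passing NULL for the roots ulist: result only. */
        return collect_root_ids(refs, 2, NULL) == 1 ? 0 : 1;
    }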
-- cgit From 56f5c19920d09bbf91efcf80e6ba301923400f4c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:05 +0100 Subject: btrfs: remove useless logic when finding parent nodes At find_parent_nodes(), at its last step, when iterating over all direct references, we are checking if we have a share context and if we have a reference with a different root from the one in the share context. However that logic is pointless because of two reasons: 1) After the previous patch in the series (subject "btrfs: remove roots ulist when checking data extent sharedness"), the roots argument is always NULL when using a share check context (struct share_check), so this code is never triggered; 2) Even before that previous patch, we could not hit this code because if we had a reference with a root different from the one in our share context, then we would have exited earlier when doing either of the following: - Adding a second direct ref to the direct refs red black tree resulted in extent_is_shared() returning true when called from add_direct_ref() -> add_prelim_ref(), after processing delayed references or while processing references in the extent tree; - When adding a second reference to the indirect refs red black tree (same as above, extent_is_shared() returns true); - If we only have one indirect reference and no direct references, then when resolving it at resolve_indirect_refs() we immediately return that the target extent is shared, therefore never reaching that loop that iterates over all direct references at find_parent_nodes(); - If we have 1 indirect reference and 1 direct reference, then we also exit early because extent_is_shared() ends up returning true when called through add_prelim_ref() (by add_direct_ref() or add_indirect_ref()) or add_delayed_refs(). Same applies as when having a combination of direct, indirect and indirect with missing key references. This logic had been obsoleted since commit 3ec4d3238ab165 ("btrfs: allow backref search checks for shared extents"), which introduced the early exits in case an extent is shared. So just remove that logic, and assert at find_parent_nodes() that when we have a share context we don't have a roots ulist and that we haven't found the extent to be directly shared after processing delayed references and all references from the extent tree. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 4d8573fa75b4..22ab13821ada 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1219,6 +1219,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, .indirect_missing_keys = PREFTREE_INIT }; + /* Roots ulist is not needed when using a sharedness check context. */ + if (sc) + ASSERT(roots == NULL); + key.objectid = bytenr; key.offset = (u64)-1; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1310,6 +1314,20 @@ again: } } + /* + * If we have a share context and we reached here, it means the extent + * is not directly shared (no multiple reference items for it), + * otherwise we would have exited earlier with a return value of + * BACKREF_FOUND_SHARED after processing delayed references or while + * processing inline or keyed references from the extent tree. 
+	 * The extent may however be indirectly shared through shared subtrees
+	 * as a result from creating snapshots, so we determine below what is
+	 * its parent node, in case we are dealing with a metadata extent, or
+	 * what's the leaf (or leaves), from a fs tree, that has a file extent
+	 * item pointing to it in case we are dealing with a data extent.
+	 */
+	ASSERT(extent_is_shared(sc) == 0);
+
 	btrfs_release_path(path);
 
 	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
@@ -1347,11 +1365,6 @@ again:
 		 * and would retain their original ref->count < 0.
 		 */
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
-			if (sc && ref->root_id != sc->root_objectid) {
-				ret = BACKREF_FOUND_SHARED;
-				goto out;
-			}
-
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
-- 
cgit

From 73e339e6ab74cc3edcd1f1ed3d9b822baf8534e1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:06 +0100
Subject: btrfs: cache sharedness of the last few data extents during fiemap

During fiemap we process all the file extent items of an inode, by their
file offset order (left to right b+tree order), and then check if the data
extent they point at is shared or not. Until now we didn't cache those
results; we only did it for b+tree nodes/leaves, since for each unique
b+tree path we have access to hundreds of file extent items.

However, it is also common to repeat checking the sharedness of a
particular data extent in a very short time window, and the cases that
lead to that are the following:

1) COW writes.

   If we have a file extent item like this:

   [ bytenr X, offset = 0, num_bytes = 512K ]
   file offset 0                             512K

   Then a 4K write into file offset 64K happens, and we end up with the
   following file extent item layout:

   [ bytenr X, offset = 0, num_bytes = 64K ]
   file offset 0                            64K

   [ bytenr Y, offset = 0, num_bytes = 4K ]
   file offset 64K                          68K

   [ bytenr X, offset = 68K, num_bytes = 444K ]
   file offset 68K                             512K

   So during fiemap we will check for the sharedness of the data extent
   with bytenr X twice. Typically for COW writes and for at least
   moderately updated files, we end up with many file extent items that
   point to different sections of the same data extent.

2) Writing into a NOCOW file after a snapshot is taken.

   This happens if the target extent was created in a generation older
   than the generation where the last snapshot for the root (the tree the
   inode belongs to) was made.

   This leads to a scenario like the previous one.

3) Writing into sections of a preallocated extent.

   For example if a file has the following layout:

   [ bytenr X, offset = 0, num_bytes = 1M, type = prealloc ]
   0                                                        1M

   After doing a 4K write into file offset 0 and another 4K write into
   offset 512K, we get the following layout:

   [ bytenr X, offset = 0, num_bytes = 4K, type = regular ]
   0                                                       4K

   [ bytenr X, offset = 4K, num_bytes = 508K, type = prealloc ]
   4K                                                          512K

   [ bytenr X, offset = 512K, num_bytes = 4K, type = regular ]
   512K                                                       516K

   [ bytenr X, offset = 516K, num_bytes = 508K, type = prealloc ]
   516K                                                          1M

   So we end up with 4 consecutive file extent items pointing to the data
   extent at bytenr X.

4) Hole punching in the middle of an extent.
   For example if a file has the following file extent item:

   [ bytenr X, offset = 0, num_bytes = 8M ]
   0                                        8M

   And then a hole is punched for the file range [4M, 6M[, our file
   extent item is split into two:

   [ bytenr X, offset = 0, num_bytes = 4M ]
   0                                        4M

   [ 2M hole, implicit or explicit depending on NO_HOLES feature ]
   4M                                                             6M

   [ bytenr X, offset = 6M, num_bytes = 2M ]
   6M                                        8M

   Again, we end up with two file extent items pointing to the same data
   extent.

5) When reflinking (clone and deduplication) within the same file.

   This is probably the least common case of all.

In cases 1, 2, 3 and 4, when we have multiple file extent items that point
to the same data extent, their distance is usually short, typically
separated by a few slots in a b+tree leaf (or across sibling leaves). For
case 5, the distance can vary a lot, but it's also the least common case.

This change caches the result of the sharedness checks for data extents,
but only for the last 8 extents that we notice our inode refers to with
multiple file extent items. Whenever we want to check if a data extent is
shared, we look up the cache, which consists of doing a linear scan of an
8 element array, and if we find the data extent there, we return the
result and don't check the extent tree and delayed refs. The array/cache
is small so that doing the search has no noticeable negative impact on
performance in case we don't have file extent items within a distance of
8 slots that point to the same data extent.

Slots in the cache/array are overwritten in a simple round robin fashion,
as that approach fits very well. Using this simple approach with only the
last 8 data extents seen is effective as usually, when multiple file
extent items point to the same data extent, their distance is within 8
slots. It also uses very little memory, and the time to cache a result or
look up the cache is negligible.

The following test was run on a non-debug kernel (Debian's default kernel
config) to measure the impact in the case of COW writes (first example
given above), where we run fiemap after overwriting 33% of the blocks of
a file:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/sdi
  MNT=/mnt/sdi

  umount $DEV &> /dev/null
  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  FILE_SIZE=$((1 * 1024 * 1024 * 1024))

  # Create the file full of 1M extents.
  xfs_io -f -s -c "pwrite -b 1M -S 0xab 0 $FILE_SIZE" $MNT/foobar

  block_count=$((FILE_SIZE / 4096))
  # Overwrite about 33% of the file blocks.
  overwrite_count=$((block_count / 3))

  echo -e "\nOverwriting $overwrite_count 4K blocks (out of $block_count)..."
  RANDOM=123
  for ((i = 1; i <= $overwrite_count; i++)); do
      off=$(((RANDOM % block_count) * 4096))
      xfs_io -c "pwrite -S 0xcd $off 4K" $MNT/foobar > /dev/null
      echo -ne "\r$i blocks overwritten..."
  done
  echo -e "\n"

  # Unmount and mount to clear all cached metadata.
  umount $MNT
  mount $DEV $MNT

  start=$(date +%s%N)
  filefrag $MNT/foobar
  end=$(date +%s%N)
  dur=$(( (end - start) / 1000000 ))
  echo "fiemap took $dur milliseconds"

  umount $MNT

Result before applying this patch:

  fiemap took 128 milliseconds

Result after applying this patch:

  fiemap took 92 milliseconds (-28.1%)

The test is somewhat limited in the sense the gains may be higher in
practice, because in the test the filesystem is small, so we have small
fs and extent trees, plus there's no concurrent access to the trees as
well, therefore no lock contention there.
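As an illustration, the following is a minimal, self-contained userspace
sketch of this kind of fixed-size, round-robin lookup cache. The names are
hypothetical (the kernel's actual version, prev_extents_cache in the
backref share check context, appears in the diff below) and, like the
kernel's, it assumes bytenr 0 never identifies a real data extent, so
zero-initialized slots cannot produce false hits:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define PREV_EXTENTS_SIZE 8

    struct sharedness_cache {
        struct {
            uint64_t bytenr;   /* 0 means "slot unused" */
            bool is_shared;
        } slots[PREV_EXTENTS_SIZE];
        int next_slot;         /* round-robin write position */
    };

    /* Linear scan; with only 8 slots this is far cheaper than a tree walk. */
    static bool cache_lookup(const struct sharedness_cache *c, uint64_t bytenr,
                             bool *is_shared)
    {
        for (int i = 0; i < PREV_EXTENTS_SIZE; i++) {
            if (c->slots[i].bytenr == bytenr) {
                *is_shared = c->slots[i].is_shared;
                return true;
            }
        }
        return false;
    }

    /* Overwrite the oldest entry in simple round-robin fashion. */
    static void cache_store(struct sharedness_cache *c, uint64_t bytenr,
                            bool is_shared)
    {
        c->slots[c->next_slot].bytenr = bytenr;
        c->slots[c->next_slot].is_shared = is_shared;
        c->next_slot = (c->next_slot + 1) % PREV_EXTENTS_SIZE;
    }

    int main(void)
    {
        struct sharedness_cache c;
        bool shared;

        memset(&c, 0, sizeof(c));
        cache_store(&c, 4096, true);
        return (cache_lookup(&c, 4096, &shared) && shared) ? 0 : 1;
    }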
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/backref.h | 27 +++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22ab13821ada..64bea9b30f6b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -137,7 +137,25 @@ struct preftrees { struct share_check { u64 root_objectid; u64 inum; + u64 data_bytenr; + /* + * Counts number of inodes that refer to an extent (different inodes in + * the same root or different roots) that we could find. The sharedness + * check typically stops once this counter gets greater than 1, so it + * may not reflect the total number of inodes. + */ int share_count; + /* + * The number of times we found our inode refers to the data extent we + * are determining the sharedness. In other words, how many file extent + * items we could find for our inode that point to our target data + * extent. The value we get here after finishing the extent sharedness + * check may be smaller than reality, but if it ends up being greater + * than 1, then we know for sure the inode has multiple file extent + * items that point to our inode, and we can safely assume it's useful + * to cache the sharedness check result. + */ + int self_ref_count; bool have_delayed_delete_refs; }; @@ -207,7 +225,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount) + int newcount, struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; @@ -216,6 +234,11 @@ static void update_share_count(struct share_check *sc, int oldcount, sc->share_count--; else if (oldcount < 1 && newcount > 0) sc->share_count++; + + if (newref->root_id == sc->root_objectid && + newref->wanted_disk_byte == sc->data_bytenr && + newref->key_for_search.objectid == sc->inum) + sc->self_ref_count += newref->count; } /* @@ -266,14 +289,14 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, * BTRFS_[ADD|DROP]_DELAYED_REF actions. */ update_share_count(sc, ref->count, - ref->count + newref->count); + ref->count + newref->count, newref); ref->count += newref->count; free_pref(newref); return; } } - update_share_count(sc, 0, newref->count); + update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); @@ -1724,11 +1747,18 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, struct share_check shared = { .root_objectid = root->root_key.objectid, .inum = btrfs_ino(inode), + .data_bytenr = bytenr, .share_count = 0, + .self_ref_count = 0, .have_delayed_delete_refs = false, }; int level; + for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { + if (ctx->prev_extents_cache[i].bytenr == bytenr) + return ctx->prev_extents_cache[i].is_shared; + } + ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); @@ -1813,6 +1843,20 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, cond_resched(); } + /* + * Cache the sharedness result for the data extent if we know our inode + * has more than 1 file extent item that refers to the data extent. 
+	 */
+	if (ret >= 0 && shared.self_ref_count > 1) {
+		int slot = ctx->prev_extents_cache_slot;
+
+		ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr;
+		ctx->prev_extents_cache[slot].is_shared = (ret == 1);
+
+		slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE;
+		ctx->prev_extents_cache_slot = slot;
+	}
+
 	if (trans) {
 		btrfs_put_tree_mod_seq(fs_info, &elem);
 		btrfs_end_transaction(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 5f468f0defda..fda78db50be6 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,6 +23,8 @@ struct btrfs_backref_shared_cache_entry {
 	bool is_shared;
 };
 
+#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
+
 struct btrfs_backref_share_check_ctx {
 	/* Ulists used during backref walking. */
 	struct ulist refs;
@@ -32,6 +34,31 @@ struct btrfs_backref_share_check_ctx {
 	 */
 	struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
 	bool use_path_cache;
+	/*
+	 * Cache the sharedness result for the last few extents we have found,
+	 * but only for extents for which we have multiple file extent items
+	 * that point to them.
+	 * It's very common to have several file extent items that point to the
+	 * same extent (bytenr) but with different offsets and lengths. This
+	 * typically happens for COW writes, partial writes into prealloc
+	 * extents, NOCOW writes after snapshoting a root, hole punching or
+	 * reflinking within the same file (less common perhaps).
+	 * So keep a small cache with the lookup results for the extent pointed
+	 * by the last few file extent items. This cache is checked, with a
+	 * linear scan, whenever btrfs_is_data_extent_shared() is called, so
+	 * it must be small so that it does not negatively affect performance in
+	 * case we don't have multiple file extent items that point to the same
+	 * data extent.
+	 */
+	struct {
+		u64 bytenr;
+		bool is_shared;
+	} prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
+	/*
+	 * The slot in the prev_extents_cache array that will be used for
+	 * storing the sharedness result of a new data extent.
+	 */
+	int prev_extents_cache_slot;
 };
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
-- 
cgit

From 583f4ac56254c844cbbc9f23744e1f2efbf42fc7 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:07 +0100
Subject: btrfs: move up backref sharedness cache store and lookup functions

Move the static functions that lookup and store the sharedness check of an
extent buffer to a location above find_parent_nodes(), because in the next
patch the lookup function will be used by find_parent_nodes(). The store
function is also moved just because it's the counterpart to the lookup
function and it's best to have their definitions close together.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 236 ++++++++++++++++++++++++++---------------------------
 1 file changed, 118 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 64bea9b30f6b..1b1575b3a7b0 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1198,6 +1198,124 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
 	return ret;
 }
 
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */ +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!ctx->use_path_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &ctx->path_cache_entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!ctx->use_path_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(root->fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &ctx->path_cache_entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. 
+ */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &ctx->path_cache_entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + /* * this adds all existing backrefs (inline backrefs, backrefs and delayed * refs) for the given bytenr to the refs list, merges duplicates and resolves @@ -1573,124 +1691,6 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, - struct btrfs_root *root, - u64 bytenr, int level, bool *is_shared) -{ - struct btrfs_backref_shared_cache_entry *entry; - - if (!ctx->use_path_cache) - return false; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) - return false; - - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. - */ - ASSERT(level >= 0); - - entry = &ctx->path_cache_entries[level]; - - /* Unused cache entry or being used for some other extent buffer. */ - if (entry->bytenr != bytenr) - return false; - - /* - * We cached a false result, but the last snapshot generation of the - * root changed, so we now have a snapshot. Don't trust the result. - */ - if (!entry->is_shared && - entry->gen != btrfs_root_last_snapshot(&root->root_item)) - return false; - - /* - * If we cached a true result and the last generation used for dropping - * a root changed, we can not trust the result, because the dropped root - * could be a snapshot sharing this extent buffer. - */ - if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) - return false; - - *is_shared = entry->is_shared; - /* - * If the node at this level is shared, than all nodes below are also - * shared. Currently some of the nodes below may be marked as not shared - * because we have just switched from one leaf to another, and switched - * also other nodes above the leaf and below the current level, so mark - * them as shared. - */ - if (*is_shared) { - for (int i = 0; i < level; i++) { - ctx->path_cache_entries[i].is_shared = true; - ctx->path_cache_entries[i].gen = entry->gen; - } - } - - return true; -} - -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, - struct btrfs_root *root, - u64 bytenr, int level, bool is_shared) -{ - struct btrfs_backref_shared_cache_entry *entry; - u64 gen; - - if (!ctx->use_path_cache) - return; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) - return; - - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. 
- */ - ASSERT(level >= 0); - - if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); - else - gen = btrfs_root_last_snapshot(&root->root_item); - - entry = &ctx->path_cache_entries[level]; - entry->bytenr = bytenr; - entry->is_shared = is_shared; - entry->gen = gen; - - /* - * If we found an extent buffer is shared, set the cache result for all - * extent buffers below it to true. As nodes in the path are COWed, - * their sharedness is moved to their children, and if a leaf is COWed, - * then the sharedness of a data extent becomes direct, the refcount of - * data extent is increased in the extent item at the extent tree. - */ - if (is_shared) { - for (int i = 0; i < level; i++) { - entry = &ctx->path_cache_entries[i]; - entry->is_shared = is_shared; - entry->gen = gen; - } - } -} - struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) { struct btrfs_backref_share_check_ctx *ctx; -- cgit From 877c14767f106a5c8af271ebf1294c514eb76d0c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:08 +0100 Subject: btrfs: avoid duplicated resolution of indirect backrefs during fiemap During fiemap, when determining if a data extent is shared or not, if we don't find the extent is directly shared, then we need to determine if it's shared through subtrees. For that we need to resolve the indirect reference we found in order to figure out the path in the inode's fs tree, which is a path starting at the fs tree's root node and going down to the leaf that contains the file extent item that points to the data extent. We then proceed to determine if any extent buffer in that path is shared with other trees or not. Currently whenever we find the data extent that a file extent item points to is not directly shared, we always resolve the path in the fs tree, and then check if any extent buffer in the path is shared. This is a lot of work and when we have file extent items that belong to the same leaf, we have the same path, so we only need to calculate it once. This change does that, it keeps track of the current and previous leaf, and when we find that a data extent is not directly shared, we try to compute the fs tree path only once and then use it for every other file extent item in the same leaf, using the existing cached path result for the leaf as long as the cache results are valid. This saves us from doing expensive b+tree searches in the fs tree of our target inode, as well as other minor work. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test-with-snapshots.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV # Use compression to quickly create files with a lot of extents # (each with a size of 128K). mount -o compress=lzo $DEV $MNT # 40G gives 327680 extents, each with a size of 128K. xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar # Add some more files to increase the size of the fs and extent # trees (in the real world there's a lot of files and extents # from other files). xfs_io -f -c "pwrite -S 0xcd -b 1M 0 20G" $MNT/file1 xfs_io -f -c "pwrite -S 0xef -b 1M 0 20G" $MNT/file2 xfs_io -f -c "pwrite -S 0x73 -b 1M 0 20G" $MNT/file3 # Create a snapshot so all the extents become indirectly shared # through subtrees, with a generation less than or equals to the # generation used to create the snapshot. 
btrfs subvolume snapshot -r $MNT $MNT/snap1 umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" echo start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Result before applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1204 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 729 milliseconds (metadata cached) Result after applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 732 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 421 milliseconds (metadata cached) That's a -46.1% total reduction for the metadata not cached case, and a -42.2% reduction for the cached metadata case. The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 64 ++++++++++++++++++++++++++++++++++++++++++++-------- fs/btrfs/backref.h | 13 +++++++++++ fs/btrfs/extent_io.c | 2 ++ 3 files changed, 69 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 1b1575b3a7b0..94a3c6deafbb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,8 +16,9 @@ #include "misc.h" #include "tree-mod-log.h" -/* Just an arbitrary number so we can be sure this happened */ -#define BACKREF_FOUND_SHARED 6 +/* Just arbitrary numbers so we can be sure one of these happened. */ +#define BACKREF_FOUND_SHARED 6 +#define BACKREF_FOUND_NOT_SHARED 7 struct extent_inode_elem { u64 inum; @@ -135,7 +136,8 @@ struct preftrees { * - decremented when a ref->count transitions to <1 */ struct share_check { - u64 root_objectid; + struct btrfs_backref_share_check_ctx *ctx; + struct btrfs_root *root; u64 inum; u64 data_bytenr; /* @@ -235,7 +237,7 @@ static void update_share_count(struct share_check *sc, int oldcount, else if (oldcount < 1 && newcount > 0) sc->share_count++; - if (newref->root_id == sc->root_objectid && + if (newref->root_id == sc->root->root_key.objectid && newref->wanted_disk_byte == sc->data_bytenr && newref->key_for_search.objectid == sc->inum) sc->self_ref_count += newref->count; @@ -742,7 +744,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; } - if (sc && ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root->root_key.objectid) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -1469,6 +1471,44 @@ again: */ ASSERT(extent_is_shared(sc) == 0); + /* + * If we are here for a data extent and we have a share_check structure + * it means the data extent is not directly shared (does not have + * multiple reference items), so we have to check if a path in the fs + * tree (going from the root node down to the leaf that has the file + * extent item pointing to the data extent) is shared, that is, if any + * of the extent buffers in the path is referenced by other trees. 
+ */ + if (sc && bytenr == sc->data_bytenr) { + /* + * If we are only determining if a data extent is shared or not + * and the corresponding file extent item is located in the same + * leaf as the previous file extent item, we can skip resolving + * indirect references for a data extent, since the fs tree path + * is the same (same leaf, so same path). We skip as long as the + * cached result for the leaf is valid and only if there's only + * one file extent item pointing to the data extent, because in + * the case of multiple file extent items, they may be located + * in different leaves and therefore we have multiple paths. + */ + if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr && + sc->self_ref_count == 1) { + bool cached; + bool is_shared; + + cached = lookup_backref_shared_cache(sc->ctx, sc->root, + sc->ctx->curr_leaf_bytenr, + 0, &is_shared); + if (cached) { + if (is_shared) + ret = BACKREF_FOUND_SHARED; + else + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + } + } + btrfs_release_path(path); ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); @@ -1745,7 +1785,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { - .root_objectid = root->root_key.objectid, + .ctx = ctx, + .root = root, .inum = btrfs_ino(inode), .data_bytenr = bytenr, .share_count = 0, @@ -1783,12 +1824,13 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, NULL, NULL, &shared, false); - if (ret == BACKREF_FOUND_SHARED) { - /* this is the only condition under which we return 1 */ - ret = 1; + if (ret == BACKREF_FOUND_SHARED || + ret == BACKREF_FOUND_NOT_SHARED) { + /* If shared must return 1, otherwise return 0. */ + ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, - level, true); + level, ret == 1); break; } if (ret < 0 && ret != -ENOENT) @@ -1865,6 +1907,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, } out: ulist_release(&ctx->refs); + ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr; + return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index fda78db50be6..6dac462430b0 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -28,6 +28,19 @@ struct btrfs_backref_shared_cache_entry { struct btrfs_backref_share_check_ctx { /* Ulists used during backref walking. */ struct ulist refs; + /* + * The current leaf the caller of btrfs_is_data_extent_shared() is at. + * Typically the caller (at the moment only fiemap) tries to determine + * the sharedness of data extents point by file extent items from entire + * leaves. + */ + u64 curr_leaf_bytenr; + /* + * The previous leaf the caller was at in the previous call to + * btrfs_is_data_extent_shared(). This may be the same as the current + * leaf. On the first call it must be 0. + */ + u64 prev_leaf_bytenr; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e25e54d2216f..4e4f28387ace 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3966,6 +3966,8 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, if (extent_end <= lockstart) goto next_item; + backref_ctx->curr_leaf_bytenr = leaf->start; + /* We have an implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { const u64 range_end = min(key.offset, lockend) - 1; -- cgit From 6976201f188f0a899fe5f0dfde32b9855d1d22cc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:09 +0100 Subject: btrfs: avoid unnecessary resolution of indirect backrefs during fiemap During fiemap, when determining if a data extent is shared or not, if we don't find the extent is directly shared, then we need to determine if it's shared through subtrees. For that we need to resolve the indirect reference we found in order to figure out the path in the inode's fs tree, which is a path starting at the fs tree's root node and going down to the leaf that contains the file extent item that points to the data extent. We then proceed to determine if any extent buffer in that path is shared with other trees or not. However, when the generation of the data extent is more recent than the last generation used to snapshot the root, we don't need to determine the path, since the data extent can not be shared through snapshots. For this case we currently still determine the leaf of that path (at find_parent_nodes()), but then stop determining the other nodes in the path (at btrfs_is_data_extent_shared()) as it's pointless. So do the check of the data extent's generation earlier, at find_parent_nodes(), before trying to resolve the indirect reference to determine the leaf in the path. This saves us from doing one expensive b+tree search in the fs tree of our target inode, as well as other minor work. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test-fiemap.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV # Use compression to quickly create files with a lot of extents # (each with a size of 128K). mount -o compress=lzo $DEV $MNT # 40G gives 327680 extents, each with a size of 128K. xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar # Add some more files to increase the size of the fs and extent # trees (in the real world there's a lot of files and extents # from other files). xfs_io -f -c "pwrite -S 0xcd -b 1M 0 20G" $MNT/file1 xfs_io -f -c "pwrite -S 0xef -b 1M 0 20G" $MNT/file2 xfs_io -f -c "pwrite -S 0x73 -b 1M 0 20G" $MNT/file3 umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" echo start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Before applying this patch: (...)
/mnt/sdi/foobar: 327680 extents found fiemap took 689 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 393 milliseconds (metadata cached) That's a -46.4% total reduction for the metadata not cached case, and a -47.0% reduction for the cached metadata case. The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 94a3c6deafbb..232f49415ab9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -140,6 +140,7 @@ struct share_check { struct btrfs_root *root; u64 inum; u64 data_bytenr; + u64 data_extent_gen; /* * Counts number of inodes that refer to an extent (different inodes in * the same root or different roots) that we could find. The sharedness @@ -1480,6 +1481,21 @@ again: * of the extent buffers in the path is referenced by other trees. */ if (sc && bytenr == sc->data_bytenr) { + /* + * If our data extent is from a generation more recent than the + * last generation used to snapshot the root, then we know that + * it can not be shared through subtrees, so we can skip + * resolving indirect references, there's no point in + * determining the extent buffers for the path from the fs tree + * root node down to the leaf that has the file extent item that + * points to the data extent. + */ + if (sc->data_extent_gen > + btrfs_root_last_snapshot(&sc->root->root_item)) { + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + /* * If we are only determining if a data extent is shared or not * and the corresponding file extent item is located in the same @@ -1789,6 +1805,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .root = root, .inum = btrfs_ino(inode), .data_bytenr = bytenr, + .data_extent_gen = extent_gen, .share_count = 0, .self_ref_count = 0, .have_delayed_delete_refs = false, @@ -1836,17 +1853,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, if (ret < 0 && ret != -ENOENT) break; ret = 0; - /* - * If our data extent is not shared through reflinks and it was - * created in a generation after the last one used to create a - * snapshot of the inode's root, then it can not be shared - * indirectly through subtrees, as that can only happen with - * snapshots. In this case bail out, no need to check for the - * sharedness of extent buffers. - */ - if (level == -1 && - extent_gen > btrfs_root_last_snapshot(&root->root_item)) - break; /* * If our data extent was not directly shared (without multiple -- cgit From cc4804bfd6392bcb626a687689b36fd9cda87c11 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 13 Oct 2022 15:52:09 -0700 Subject: btrfs: skip reclaim if block_group is empty As we delete extents from a block group, at some deletion we cross below the reclaim threshold. It is possible we are still in the middle of deleting more extents and might soon hit 0. If the block group is empty by the time the reclaim worker runs, we will still relocate it. This works just fine, as relocating an empty block group ultimately results in properly deleting it. However, we have more direct ways of removing empty block groups in the cleaner thread. 
Those are either async discard or the unused_bgs list. In fact, when we decide whether to relocate a block group during extent deletion, we do check for emptiness and prefer the discard/unused_bgs mechanisms when possible. Not using relocation for this case reduces some modest overhead from empty bg relocation: - extra transactions - extra metadata use/churn for creating relocation metadata - trying to read the extent tree to look for extents (and in this case finding none) Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3f8b1cbbbc43..684401aa014a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1606,6 +1606,24 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. + */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } spin_unlock(&bg->lock); /* Get out fast, in case we're unmounting the filesystem */ -- cgit From 81531225e5bd50ce628816cc1445c8b52aa99db2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 13 Oct 2022 15:52:10 -0700 Subject: btrfs: re-check reclaim condition in reclaim worker I have observed the following case play out and lead to unnecessary relocations: 1. write a file across multiple block groups 2. delete the file 3. several block groups fall below the reclaim threshold 4. reclaim the first, moving extents into the others 5. reclaim the others which are now actually very full, leading to poor reclaim behavior with lots of writing, allocating new block groups, etc. I believe the risk of missing some reasonable reclaims is worth it when traded off against the savings of avoiding overfull reclaims. Going forward, it could be interesting to make the check more advanced (zoned aware, fragmentation aware, etc...) so that it can be a really strong signal both at extent delete and reclaim time. 
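The decision logic boils down to simple threshold math. A minimal standalone C sketch of it follows (the function name and the 1GiB/75% values are illustrative assumptions, not the kernel's; the real check is should_reclaim_block_group() in the diff below):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Model of the reclaim condition: reclaim only when a deletion
   * crossed the threshold from above. */
  static bool should_reclaim(uint64_t length, uint64_t used,
                             uint64_t bytes_freed, unsigned int thresh_pct)
  {
          uint64_t thresh = length * thresh_pct / 100;
          uint64_t old_used = used + bytes_freed;

          if (thresh_pct == 0)
                  return false;
          /* Was already below the threshold: likely a new block group. */
          if (old_used < thresh)
                  return false;
          /* Still at or above the threshold: too full to bother. */
          if (used >= thresh)
                  return false;
          return true;
  }

  int main(void)
  {
          uint64_t len = 1ULL << 30;      /* assume a 1GiB block group */

          /* A deletion crosses 75% from above: queue for reclaim. */
          printf("%d\n", should_reclaim(len, len / 2, len / 2, 75));
          /* Worker re-check with a fake giant bytes_freed: a group that
           * was refilled above the threshold is skipped. */
          printf("%d\n", should_reclaim(len, len * 9 / 10, len, 75));
          return 0;
  }

This prints 1 then 0: the same block group that qualified when its extents were deleted no longer qualifies once it is full again, which is exactly the re-check the worker now performs.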
Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 65 +++++++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 684401aa014a..b1f4acdeafb9 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1539,6 +1539,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) return true; } +static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1623,6 +1647,22 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) spin_unlock(&bg->lock); up_write(&space_info->groups_sem); goto next; + + } + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; } spin_unlock(&bg->lock); @@ -3241,31 +3281,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } -static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, - u64 bytes_freed) -{ - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); - const u64 new_val = bg->used; - const u64 old_val = new_val + bytes_freed; - u64 thresh; - - if (reclaim_thresh == 0) - return false; - - thresh = div_factor_fine(bg->length, reclaim_thresh); - - /* - * If we were below the threshold before don't reclaim, we are likely a - * brand new block group and we don't want to relocate new block groups. - */ - if (old_val < thresh) - return false; - if (new_val >= thresh) - return false; - return true; -} - int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { -- cgit From 879b2221983184cd61fe852f90f3c0d6dd647f67 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Oct 2022 11:36:25 +0100 Subject: btrfs: switch GFP_ATOMIC to GFP_NOFS when fixing up low keys When fixing up the first key of each node above the current level, at fixup_low_keys(), we are doing a GFP_ATOMIC allocation for inserting an operation record for the tree mod log. However we can do just fine with GFP_NOFS nowadays. The need for GFP_ATOMIC was for the old days when we had custom locks with spinning behaviour for extent buffers and we were in spinning mode while at fixup_low_keys(). 
Now we use rw semaphores for extent buffer locks, so we can safely use GFP_NOFS. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f461e05cc9c..e6182d530377 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2418,7 +2418,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); -- cgit From 33cff222faffefc57f57326547e68133a7816e29 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 14 Oct 2022 14:44:33 +0100 Subject: btrfs: remove gfp_t flag from btrfs_tree_mod_log_insert_key() All callers of btrfs_tree_mod_log_insert_key() are now passing a GFP_NOFS flag to it, so remove the flag from it and from alloc_tree_mod_elem() and use it directly within alloc_tree_mod_elem(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/tree-mod-log.c | 19 +++++++++---------- fs/btrfs/tree-mod-log.h | 2 +- 3 files changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e6182d530377..d645933ef135 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -489,7 +489,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -1018,7 +1018,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1064,7 +1064,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1166,7 +1166,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1220,7 +1220,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2418,7 +2418,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - 
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -2779,7 +2779,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, } if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -4219,7 +4219,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot - 1)); } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); } diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 8a3a14686d3e..18b4699bcb11 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -197,12 +197,11 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, - gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; @@ -220,7 +219,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, } int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; int ret; @@ -228,7 +227,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, if (!tree_mod_need_log(eb->fs_info, eb)) return 0; - tm = alloc_tree_mod_elem(eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) return -ENOMEM; @@ -276,7 +275,7 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -364,7 +363,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -502,14 +501,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); if (!tm_list_rem[i]) { ret = -ENOMEM; goto free_tms; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; goto free_tms; @@ -564,7 +563,7 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 12605d19621b..8cffe0bc2a39 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -32,7 +32,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); int btrfs_tree_mod_log_insert_key(struct 
extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags); + enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, -- cgit From 5565b8e0adcdd8ee32d02d82d98cdf6d966793c7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 12 Oct 2022 17:22:35 +0800 Subject: btrfs: make module init/exit match their sequence [BACKGROUND] In theory init_btrfs_fs() and exit_btrfs_fs() should match their sequence, thus normally they should look like this:

  init_btrfs_fs()       | exit_btrfs_fs()
  ----------------------+------------------------
  init_A();             |
  init_B();             |
  init_C();             |
                        | exit_C();
                        | exit_B();
                        | exit_A();

The same should hold for the error path of init_btrfs_fs(). But that's not the case: some exit functions don't match the sequence of their init functions in init_btrfs_fs(). Furthermore in init_btrfs_fs(), we need to have a new error label for each new init function we add. This does not scale, especially as we may soon add several new functions to init_btrfs_fs(). [ENHANCEMENT] The patch introduces the following to improve the situation: - struct init_sequence Just a wrapper around a pair of init and exit function pointers. The init function must return int, thus some init functions need to be updated to return 0. The exit function can be NULL, as some init steps only output a message. - struct mod_init_seq[] array This is a const array, recording all the initialization we need to do in init_btrfs_fs(), in the same order as the old init_btrfs_fs(). - bool mod_init_result[] array This is a bool array, recording whether the corresponding entry in mod_init_seq[] has been initialized. The reason to split mod_init_seq[] and mod_init_result[] is to avoid section mismatch warnings. All init functions are in .init.text, but if mod_init_seq[] recorded an @initialized member it could no longer be const, would be placed in the .data section, and would trigger a modpost warning. For init_btrfs_fs() we just call all init functions in their order in the mod_init_seq[] array and, after each call, set the corresponding mod_init_result[] entry to true. For exit_btrfs_fs() and the error handling path of init_btrfs_fs(), we just iterate mod_init_seq[] in reverse order and skip all uninitialized entries. With this patch, init_btrfs_fs()/exit_btrfs_fs() will be much easier to expand and will always follow the strict order.
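The mechanism is easy to demonstrate outside the kernel. A minimal standalone sketch (hypothetical subsystems a and b, with b failing on purpose; only the table-driven pattern matches the patch below) looks like this:

  #include <stdbool.h>
  #include <stdio.h>

  struct init_sequence {
          int (*init_func)(void);
          void (*exit_func)(void);        /* may be NULL */
  };

  static int init_a(void) { puts("init a"); return 0; }
  static void exit_a(void) { puts("exit a"); }
  static int init_b(void) { puts("init b"); return -1; /* simulated failure */ }
  static void exit_b(void) { puts("exit b"); }

  static const struct init_sequence seq[] = {
          { .init_func = init_a, .exit_func = exit_a },
          { .init_func = init_b, .exit_func = exit_b },
  };
  static bool done[sizeof(seq) / sizeof(seq[0])];

  /* Tear down in reverse order, touching only what succeeded. */
  static void teardown(void)
  {
          for (int i = sizeof(seq) / sizeof(seq[0]) - 1; i >= 0; i--) {
                  if (!done[i])
                          continue;
                  if (seq[i].exit_func)
                          seq[i].exit_func();
                  done[i] = false;
          }
  }

  int main(void)
  {
          for (size_t i = 0; i < sizeof(seq) / sizeof(seq[0]); i++) {
                  if (seq[i].init_func() < 0) {
                          teardown();
                          return 1;
                  }
                  done[i] = true;
          }
          teardown();
          return 0;
  }

Running it prints "init a", "init b", "exit a": the failed step is never torn down, the unwind happens in the reverse of the init order, and there is no error-label chain to maintain.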
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +- fs/btrfs/compression.h | 2 +- fs/btrfs/props.c | 3 +- fs/btrfs/props.h | 2 +- fs/btrfs/super.c | 242 ++++++++++++++++++++++++------------------------- 5 files changed, 122 insertions(+), 130 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e6635fe70067..65a4e5087215 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1243,12 +1243,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void __init btrfs_init_compress(void) +int __init btrfs_init_compress(void) { btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + return 0; } void __cold btrfs_exit_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1aa02903de69..9da2343eeff5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -77,7 +77,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } -void __init btrfs_init_compress(void); +int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 07f62e3ba6a5..e04289347775 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -454,7 +454,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return 0; } -void __init btrfs_props_init(void) +int __init btrfs_props_init(void) { int i; @@ -464,5 +464,6 @@ void __init btrfs_props_init(void) hash_add(prop_handlers_ht, &p->node, h); } + return 0; } diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index ca9dd3df129b..6e283196e38a 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -8,7 +8,7 @@ #include "ctree.h" -void __init btrfs_props_init(void); +int __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a93fbd29494..3a33dd9847d9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2694,7 +2694,7 @@ static __cold void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void __init btrfs_print_mod_info(void) +static int __init btrfs_print_mod_info(void) { static const char options[] = "" #ifdef CONFIG_BTRFS_DEBUG @@ -2721,143 +2721,133 @@ static void __init btrfs_print_mod_info(void) #endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + return 0; } -static int __init init_btrfs_fs(void) +static int register_btrfs(void) { - int err; - - btrfs_props_init(); - - err = btrfs_init_sysfs(); - if (err) - return err; - - btrfs_init_compress(); - - err = btrfs_init_cachep(); - if (err) - goto free_compress; - - err = btrfs_transaction_init(); - if (err) - goto free_cachep; - - err = btrfs_ctree_init(); - if (err) - goto free_transaction; - - err = btrfs_free_space_init(); - if (err) - goto free_ctree; - - err = extent_state_init_cachep(); - if (err) - goto free_free_space; - - err = extent_buffer_init_cachep(); - if (err) - goto free_extent_cachep; - - err = btrfs_bioset_init(); - if (err) - goto free_eb_cachep; - - err = extent_map_init(); - if (err) - goto free_bioset; - - err = ordered_data_init(); - if (err) - goto free_extent_map; - - err = 
btrfs_delayed_inode_init(); - if (err) - goto free_ordered_data; - - err = btrfs_auto_defrag_init(); - if (err) - goto free_delayed_inode; - - err = btrfs_delayed_ref_init(); - if (err) - goto free_auto_defrag; - - err = btrfs_prelim_ref_init(); - if (err) - goto free_delayed_ref; - - err = btrfs_interface_init(); - if (err) - goto free_prelim_ref; + return register_filesystem(&btrfs_fs_type); +} - btrfs_print_mod_info(); +static void unregister_btrfs(void) +{ + unregister_filesystem(&btrfs_fs_type); +} - err = btrfs_run_sanity_tests(); - if (err) - goto unregister_ioctl; +/* Helper structure for long init/exit functions. */ +struct init_sequence { + int (*init_func)(void); + /* Can be NULL if the init_func doesn't need cleanup. */ + void (*exit_func)(void); +}; - err = register_filesystem(&btrfs_fs_type); - if (err) - goto unregister_ioctl; +static const struct init_sequence mod_init_seq[] = { + { + .init_func = btrfs_props_init, + .exit_func = NULL, + }, { + .init_func = btrfs_init_sysfs, + .exit_func = btrfs_exit_sysfs, + }, { + .init_func = btrfs_init_compress, + .exit_func = btrfs_exit_compress, + }, { + .init_func = btrfs_init_cachep, + .exit_func = btrfs_destroy_cachep, + }, { + .init_func = btrfs_transaction_init, + .exit_func = btrfs_transaction_exit, + }, { + .init_func = btrfs_ctree_init, + .exit_func = btrfs_ctree_exit, + }, { + .init_func = btrfs_free_space_init, + .exit_func = btrfs_free_space_exit, + }, { + .init_func = extent_state_init_cachep, + .exit_func = extent_state_free_cachep, + }, { + .init_func = extent_buffer_init_cachep, + .exit_func = extent_buffer_free_cachep, + }, { + .init_func = btrfs_bioset_init, + .exit_func = btrfs_bioset_exit, + }, { + .init_func = extent_map_init, + .exit_func = extent_map_exit, + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, + }, { + .init_func = btrfs_delayed_inode_init, + .exit_func = btrfs_delayed_inode_exit, + }, { + .init_func = btrfs_auto_defrag_init, + .exit_func = btrfs_auto_defrag_exit, + }, { + .init_func = btrfs_delayed_ref_init, + .exit_func = btrfs_delayed_ref_exit, + }, { + .init_func = btrfs_prelim_ref_init, + .exit_func = btrfs_prelim_ref_exit, + }, { + .init_func = btrfs_interface_init, + .exit_func = btrfs_interface_exit, + }, { + .init_func = btrfs_print_mod_info, + .exit_func = NULL, + }, { + .init_func = btrfs_run_sanity_tests, + .exit_func = NULL, + }, { + .init_func = register_btrfs, + .exit_func = unregister_btrfs, + } +}; - return 0; +static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; -unregister_ioctl: - btrfs_interface_exit(); -free_prelim_ref: - btrfs_prelim_ref_exit(); -free_delayed_ref: - btrfs_delayed_ref_exit(); -free_auto_defrag: - btrfs_auto_defrag_exit(); -free_delayed_inode: - btrfs_delayed_inode_exit(); -free_ordered_data: - ordered_data_exit(); -free_extent_map: - extent_map_exit(); -free_bioset: - btrfs_bioset_exit(); -free_eb_cachep: - extent_buffer_free_cachep(); -free_extent_cachep: - extent_state_free_cachep(); -free_free_space: - btrfs_free_space_exit(); -free_ctree: - btrfs_ctree_exit(); -free_transaction: - btrfs_transaction_exit(); -free_cachep: - btrfs_destroy_cachep(); -free_compress: - btrfs_exit_compress(); - btrfs_exit_sysfs(); +static void __exit exit_btrfs_fs(void) +{ + int i; - return err; + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } } -static void __exit exit_btrfs_fs(void) +static int __init 
init_btrfs_fs(void) { - btrfs_free_space_exit(); - btrfs_ctree_exit(); - btrfs_transaction_exit(); - btrfs_destroy_cachep(); - btrfs_delayed_ref_exit(); - btrfs_auto_defrag_exit(); - btrfs_delayed_inode_exit(); - btrfs_prelim_ref_exit(); - ordered_data_exit(); - extent_map_exit(); - btrfs_bioset_exit(); - extent_state_free_cachep(); - extent_buffer_free_cachep(); - btrfs_interface_exit(); - unregister_filesystem(&btrfs_fs_type); - btrfs_exit_sysfs(); - btrfs_cleanup_fs_uuids(); - btrfs_exit_compress(); + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { + ASSERT(!mod_init_result[i]); + ret = mod_init_seq[i].init_func(); + if (ret < 0) + goto error; + mod_init_result[i] = true; + } + return 0; + +error: + /* + * If we call exit_btrfs_fs() it would cause section mismatch. + * As init_btrfs_fs() belongs to .init.text, while exit_btrfs_fs() + * belongs to .exit.text. + */ + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } + return ret; } late_initcall(init_btrfs_fs); -- cgit From d549ff7bdbe72f5e0e934895090fed2647e9813b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:20:25 +0200 Subject: btrfs: add helper for bit enumeration Define helper macro that can be used in enum {} to utilize the automatic increment to define all bits without directly defining the values or using additional linear bits. 1. capture the sequence value, N 2. use the value to define the given enum with N-th bit set 3. reset the sequence back to N Use for enums that do not require fixed values for symbolic names (like for on-disk structures): enum { ENUM_BIT(FIRST), ENUM_BIT(SECOND), ENUM_BIT(THIRD) }; Where the values would be 0x1, 0x2 and 0x4. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/misc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index f9850edfd726..69826ccd6644 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -10,6 +10,14 @@ #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +/* + * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. 
+ */ +#define ENUM_BIT(name) \ + __ ## name ## _BIT, \ + name = (1U << __ ## name ## _BIT), \ + __ ## name ## _SEQ = __ ## name ## _BIT + static inline void cond_wake_up(struct wait_queue_head *wq) { /* -- cgit From c7321b76dfcc8a1b8d7dad3eff3595bad28252e2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:27:45 +0200 Subject: btrfs: convert BTRFS_ILOCK-* defines to enum bit Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c0c0dc8fa99f..5adf76d89f9b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ #include "async-thread.h" #include "block-rsv.h" #include "locking.h" +#include "misc.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -3131,9 +3132,11 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, extern const struct dentry_operations btrfs_dentry_operations; /* Inode locking type flags, by default the exclusive lock is taken */ -#define BTRFS_ILOCK_SHARED (1U << 0) -#define BTRFS_ILOCK_TRY (1U << 1) -#define BTRFS_ILOCK_MMAP (1U << 2) +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); -- cgit From fd8d2951f478c095f5e65c5f14f2be4d4724c5ea Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:31:38 +0200 Subject: btrfs: convert extent_io page op defines to enum bits Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7929f054dda3..a5ec1475988f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -9,6 +9,7 @@ #include #include "compression.h" #include "ulist.h" +#include "misc.h" enum { EXTENT_BUFFER_UPTODATE, @@ -29,13 +30,15 @@ enum { }; /* these are flags for __process_pages_contig */ -#define PAGE_UNLOCK (1 << 0) -/* Page starts writeback, clear dirty bit and set writeback bit */ -#define PAGE_START_WRITEBACK (1 << 1) -#define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_ORDERED (1 << 3) -#define PAGE_SET_ERROR (1 << 4) -#define PAGE_LOCK (1 << 5) +enum { + ENUM_BIT(PAGE_UNLOCK), + /* Page starts writeback, clear dirty bit and set writeback bit */ + ENUM_BIT(PAGE_START_WRITEBACK), + ENUM_BIT(PAGE_END_WRITEBACK), + ENUM_BIT(PAGE_SET_ORDERED), + ENUM_BIT(PAGE_SET_ERROR), + ENUM_BIT(PAGE_LOCK), +}; /* * page->private values. 
Every page that is controlled by the extent -- cgit From d3b4d0fd5518fd000938c6cbd4817085c210dcbe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:34:58 +0200 Subject: btrfs: convert EXTENT_* bits to enums Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 71 +++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index c71aa29f719d..673637ed2fa9 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,43 +3,48 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include "misc.h" + struct extent_changeset; struct io_failure_record; /* Bits for the extent state */ -#define EXTENT_DIRTY (1U << 0) -#define EXTENT_UPTODATE (1U << 1) -#define EXTENT_LOCKED (1U << 2) -#define EXTENT_NEW (1U << 3) -#define EXTENT_DELALLOC (1U << 4) -#define EXTENT_DEFRAG (1U << 5) -#define EXTENT_BOUNDARY (1U << 6) -#define EXTENT_NODATASUM (1U << 7) -#define EXTENT_CLEAR_META_RESV (1U << 8) -#define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_NORESERVE (1U << 11) -#define EXTENT_QGROUP_RESERVED (1U << 12) -#define EXTENT_CLEAR_DATA_RESV (1U << 13) -/* - * Must be cleared only during ordered extent completion or on error paths if we - * did not manage to submit bios and create the ordered extents for the range. - * Should not be cleared during page release and page invalidation (if there is - * an ordered extent in flight), that is left for the ordered extent completion. - */ -#define EXTENT_DELALLOC_NEW (1U << 14) -/* - * When an ordered extent successfully completes for a region marked as a new - * delalloc range, use this flag when clearing a new delalloc range to indicate - * that the VFS' inode number of bytes should be incremented and the inode's new - * delalloc bytes decremented, in an atomic way to prevent races with stat(2). - */ -#define EXTENT_ADD_INODE_BYTES (1U << 15) - -/* - * Set during truncate when we're clearing an entire range and we just want the - * extent states to go away. - */ -#define EXTENT_CLEAR_ALL_BITS (1U << 16) +enum { + ENUM_BIT(EXTENT_DIRTY), + ENUM_BIT(EXTENT_UPTODATE), + ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DELALLOC), + ENUM_BIT(EXTENT_DEFRAG), + ENUM_BIT(EXTENT_BOUNDARY), + ENUM_BIT(EXTENT_NODATASUM), + ENUM_BIT(EXTENT_CLEAR_META_RESV), + ENUM_BIT(EXTENT_NEED_WAIT), + ENUM_BIT(EXTENT_NORESERVE), + ENUM_BIT(EXTENT_QGROUP_RESERVED), + ENUM_BIT(EXTENT_CLEAR_DATA_RESV), + /* + * Must be cleared only during ordered extent completion or on error + * paths if we did not manage to submit bios and create the ordered + * extents for the range. Should not be cleared during page release + * and page invalidation (if there is an ordered extent in flight), + * that is left for the ordered extent completion. + */ + ENUM_BIT(EXTENT_DELALLOC_NEW), + /* + * When an ordered extent successfully completes for a region marked as + * a new delalloc range, use this flag when clearing a new delalloc + * range to indicate that the VFS' inode number of bytes should be + * incremented and the inode's new delalloc bytes decremented, in an + * atomic way to prevent races with stat(2). + */ + ENUM_BIT(EXTENT_ADD_INODE_BYTES), + /* + * Set during truncate when we're clearing an entire range and we just + * want the extent states to go away. 
+ */ + ENUM_BIT(EXTENT_CLEAR_ALL_BITS), +}; #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -- cgit From e0a8b9a74767f748881b1e2408ed9afe7a26d9fd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:37:35 +0200 Subject: btrfs: convert QGROUP_* defines to enum bits The defines/enums are used only for tracepoints and are not part of the on-disk format. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/qgroup.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 578c77e94200..3fb5459c9309 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -11,6 +11,7 @@ #include #include "ulist.h" #include "delayed-ref.h" +#include "misc.h" /* * Btrfs qgroup overview @@ -242,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) /* * For qgroup event trace points only */ -#define QGROUP_RESERVE (1<<0) -#define QGROUP_RELEASE (1<<1) -#define QGROUP_FREE (1<<2) +enum { + ENUM_BIT(QGROUP_RESERVE), + ENUM_BIT(QGROUP_RELEASE), + ENUM_BIT(QGROUP_FREE), +}; int btrfs_quota_enable(struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); -- cgit From cc37ea61920eca87316be1ae255e5e5e7b12f7b6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:43:06 +0200 Subject: btrfs: convert __TRANS_* defines to enum bits The base transaction bits can be defined as bits in a contiguous sequence, although right now there's a hole from bit 1 to 8. The bits are used for btrfs_trans_handle::type, and there's another set of TRANS_STATE_* defines that are for btrfs_transaction::state. They are mutually exclusive, though the hole in the sequence looks like it was made for the states. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1332f193b800..b39ebf98af60 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -10,6 +10,7 @@ #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" +#include "misc.h" enum btrfs_trans_state { TRANS_STATE_RUNNING, @@ -98,14 +99,15 @@ struct btrfs_transaction { struct list_head releasing_ebs; }; -#define __TRANS_FREEZABLE (1U << 0) - -#define __TRANS_START (1U << 9) -#define __TRANS_ATTACH (1U << 10) -#define __TRANS_JOIN (1U << 11) -#define __TRANS_JOIN_NOLOCK (1U << 12) -#define __TRANS_DUMMY (1U << 13) -#define __TRANS_JOIN_NOSTART (1U << 14) +enum { + ENUM_BIT(__TRANS_FREEZABLE), + ENUM_BIT(__TRANS_START), + ENUM_BIT(__TRANS_ATTACH), + ENUM_BIT(__TRANS_JOIN), + ENUM_BIT(__TRANS_JOIN_NOLOCK), + ENUM_BIT(__TRANS_DUMMY), + ENUM_BIT(__TRANS_JOIN_NOSTART), +}; #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) -- cgit From 7248e0cebbefaba94c0c37f708a134dad3acba0e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 9 Sep 2022 14:45:22 +0800 Subject: btrfs: skip update of block group item if used bytes are the same [BACKGROUND] When committing a transaction, we will update block group items for all dirty block groups. But in fact, dirty block groups don't always need to update their block group items. It's pretty common to have a metadata block group which experienced several COW operations, but still has the same amount of used bytes. In that case, we may unnecessarily COW a tree block while changing nothing.
[ENHANCEMENT] This patch introduces a btrfs_block_group::commit_used member to remember the last committed used bytes, and uses that new member to skip unnecessary block group item updates. This would be more common for large filesystems, where a metadata block group can be as large as 1GiB, containing at most 64K metadata items. In that case, if COW added and then deleted one metadata item near the end of the block group, then it's completely possible we don't need to touch the block group item at all. [BENCHMARK] The change itself can have quite a high chance (20~80%) to skip block group item updates in a lot of workloads. As a result, it leads to shorter time spent on btrfs_write_dirty_block_groups(), and overall reduces the execution time of the critical section of btrfs_commit_transaction(). Here comes a fio command, which will do random writes in 4K block size, causing very heavy metadata updates. fio --filename=$mnt/file --size=512M --rw=randwrite --direct=1 --bs=4k \ --ioengine=libaio --iodepth=64 --runtime=300 --numjobs=4 \ --name=random_write --fallocate=none --time_based --fsync_on_close=1 The file size (512M) and number of threads (4) mean 2GiB of file size in total, but during the full 300s run time, my dedicated SATA SSD is able to write around 20~25GiB, which is over 10 times the file size. Thus after we fill the initial 2G, we should not cause many block group item updates. Please note, the fio numbers by themselves don't change much, but if we look deeper, there is some reduced execution time, especially for the critical section of btrfs_commit_transaction(). I added extra trace_printk() to measure the following per-transaction execution times: - Critical section of btrfs_commit_transaction() By re-using the existing update_commit_stats() function, which has already calculated the interval correctly. - The while() loop for btrfs_write_dirty_block_groups() Although this includes the execution time of btrfs_run_delayed_refs(), it should still be representative overall. Both results involve transids 7~30, i.e. the same number of committed transactions. The result looks like this:

                        | Before      | After       | Diff
  ----------------------+-------------+-------------+--------
  Transaction interval  | 229247198.5 | 215016933.6 | -6.2%
  Block group interval  | 23133.33333 | 18970.83333 | -18.0%

The change in block group item updates is more obvious, as skipped block group item updates also mean fewer delayed refs. And the overall execution time for that block group update loop is pretty small, thus we can assume the extent tree is already mostly cached. If we could skip an uncached tree block, it would cause a more obvious change. Unfortunately the overall reduction in the commit transaction critical section is much smaller, as the block group item updates loop is not really the major part, at least not for the above fio script. But still we have an observable reduction in the critical section.
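The idea reduces to "cache what was last written and compare before writing again". A minimal userspace model of it follows (the structure and function names mirror the patch below, but the locking and the b+tree update are stubbed out, so this is a sketch, not the kernel code):

  #include <stdint.h>
  #include <stdio.h>

  struct block_group {
          uint64_t used;          /* current used bytes */
          uint64_t commit_used;   /* used bytes at the last written item */
  };

  /* Stand-in for the real b+tree update, which may fail. */
  static int write_item(const struct block_group *bg)
  {
          printf("writing block group item, used=%llu\n",
                 (unsigned long long)bg->used);
          return 0;
  }

  static int update_block_group_item(struct block_group *bg)
  {
          uint64_t old_commit_used = bg->commit_used;
          int ret;

          /* No change in used bytes since the last commit: skip the COW. */
          if (bg->commit_used == bg->used)
                  return 0;
          bg->commit_used = bg->used;

          ret = write_item(bg);
          if (ret < 0)
                  bg->commit_used = old_commit_used;      /* revert on error */
          return ret;
  }

  int main(void)
  {
          struct block_group bg = { .used = 4096, .commit_used = 0 };

          update_block_group_item(&bg);   /* 0 != 4096: writes the item */
          update_block_group_item(&bg);   /* unchanged: skipped */
          return 0;
  }

In the kernel variant the comparison and the commit_used update happen under the block group's spinlock, since @used can change concurrently; the model above leaves that out for brevity.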
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 28 +++++++++++++++++++++++++++- fs/btrfs/block-group.h | 6 ++++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b1f4acdeafb9..47461fd21975 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2071,6 +2071,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); + cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); @@ -2762,6 +2763,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; + u64 old_commit_used; + u64 used; + + /* + * Block group items update can be triggered out of commit transaction + * critical section, thus we need a consistent view of used bytes. + * We cannot use cache->used directly outside of the spin lock, as it + * may be changed. + */ + spin_lock(&cache->lock); + old_commit_used = cache->commit_used; + used = cache->used; + /* No change in used bytes, can safely skip it. */ + if (cache->commit_used == used) { + spin_unlock(&cache->lock); + return 0; + } + cache->commit_used = used; + spin_unlock(&cache->lock); key.objectid = cache->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2776,7 +2796,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_used(&bgi, used); btrfs_set_stack_block_group_chunk_objectid(&bgi, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); @@ -2784,6 +2804,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); + /* We didn't update the block group item, need to revert @commit_used. */ + if (ret < 0) { + spin_lock(&cache->lock); + cache->commit_used = old_commit_used; + spin_unlock(&cache->lock); + } return ret; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 39e79bed10ae..6c970a486b68 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -99,6 +99,12 @@ struct btrfs_block_group { u64 cache_generation; u64 global_root_id; + /* + * The last committed used bytes of this block group, if the above @used + * is still the same as @commit_used, we don't need to update block + * group item of this block group. + */ + u64 commit_used; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. -- cgit From 48acc47d7813a0e650754845161f04b0b27ff8ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:39 -0400 Subject: btrfs: do not use GFP_ATOMIC in the read endio We have done read endio in an async thread for a very, very long time, which makes the use of GFP_ATOMIC and unlock_extent_atomic() unneeded in our read endio path. We've noticed under heavy memory pressure in our fleet that we can fail these allocations, and then often trip a BUG_ON(!allocation), which isn't an ideal outcome. Begin to address this by simply not using GFP_ATOMIC, which will allow us to do things like actually allocate a extent state when doing set_extent_bits(UPTODATE) in the endio handler. 
End io handlers are not called in atomic context, besides we have been allocating failrec with GFP_NOFS so we'd notice there's a problem. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4e4f28387ace..78d7ea10621d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -897,9 +897,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) end_page_read(page, uptodate, offset, sectorsize); if (uptodate) set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_ATOMIC); - unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, - &cached); + offset + sectorsize - 1, &cached, GFP_NOFS); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, + &cached); } static void submit_data_read_repair(struct inode *inode, @@ -1103,7 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - unlock_extent_atomic(tree, processed->start, processed->end, &cached); + unlock_extent(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ -- cgit From da2a071b6f142f47072ea7b37e03d3ef317b8a4e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:40 -0400 Subject: btrfs: remove unused unlock_extent_atomic As of "btrfs: do not use GFP_ATOMIC in the read endio" we no longer have any users of unlock_extent_atomic, remove it. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 673637ed2fa9..d73ef24bad2e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -145,13 +145,6 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, GFP_NOFS, NULL); } -static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, - GFP_ATOMIC, NULL); -} - static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { -- cgit From 5a75034e71ef5ec0fce983afcb6c9cb0147cd5b9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:41 -0400 Subject: btrfs: do not panic if we can't allocate a prealloc extent state We sometimes have to allocate new extent states when clearing or setting new bits in an extent io tree. Generally we preallocate this before taking the tree spin lock, but we can use this preallocated extent state sometimes and then need to try to do a GFP_ATOMIC allocation under the lock. Unfortunately sometimes this fails, and then we hit the BUG_ON() and bring the box down. This happens roughly 20 times a week in our fleet. However the vast majority of callers use GFP_NOFS, which means that if this GFP_ATOMIC allocation fails, we could simply drop the spin lock, go back and allocate a new extent state with our given gfp mask, and begin again from where we left off. For the remaining callers that do not use GFP_NOFS, they are generally using GFP_NOWAIT, which still allows for some reclaim. 
So allow these allocations to attempt to happen outside of the spin lock so we don't need to rely on GFP_ATOMIC allocations. This in essence creates an infinite loop for anything that isn't GFP_NOFS. To address this we may want to migrate to using mempools for extent states so that we will always have emergency reserves in order to make our allocations. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index a630c771d25c..599db7b15574 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -572,7 +572,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -636,7 +636,8 @@ hit_next: if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -657,7 +658,8 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); @@ -987,7 +989,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, else ASSERT(failed_start == NULL && failed_state == NULL); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -1012,7 +1014,8 @@ again: state = tree_search_for_insert(tree, start, &p, &parent); if (!state) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; prealloc->start = start; prealloc->end = end; insert_state_fast(tree, prealloc, p, parent, bits, changeset); @@ -1085,7 +1088,8 @@ hit_next: } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -1122,7 +1126,8 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; /* * Avoid to free 'prealloc' if it can be merged with the later @@ -1154,7 +1159,8 @@ hit_next: } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); -- cgit From 467761f90405004e58795116aa7d81f3d298dcfb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 30 Sep 2022 17:23:01 +0200 Subject: btrfs: sysfs: convert remaining scnprintf to sysfs_emit The sysfs_emit is the safe API for writing to the sysfs files, previously converted from scnprintf, there's one left to do in btrfs_read_policy_show. 
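The conversion follows the usual pattern for multi-value sysfs attributes. A hedged sketch of such a show callback (the attribute and the names[] array here are hypothetical, not btrfs code; the real conversion is in the diff below):

  #include <linux/kernel.h>
  #include <linux/kobject.h>
  #include <linux/sysfs.h>

  static const char * const names[] = { "pid", "roundrobin" };

  static ssize_t policy_show(struct kobject *kobj,
                             struct kobj_attribute *attr, char *buf)
  {
          int active = 0;         /* assume the first policy is active */
          ssize_t ret = 0;
          int i;

          for (i = 0; i < ARRAY_SIZE(names); i++) {
                  if (i == active)
                          ret += sysfs_emit_at(buf, ret, "%s[%s]",
                                               (ret == 0 ? "" : " "),
                                               names[i]);
                  else
                          ret += sysfs_emit_at(buf, ret, "%s%s",
                                               (ret == 0 ? "" : " "),
                                               names[i]);
          }
          ret += sysfs_emit_at(buf, ret, "\n");
          return ret;
  }

Unlike the scnprintf() version, sysfs_emit_at() knows the buffer is a single page, bounds the output to PAGE_SIZE on its own and warns if the offset ever runs past it, so the bounds bookkeeping disappears from the caller.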
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 74fef1f49c35..910702df7767 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1160,16 +1160,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (fs_devices->read_policy == i) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]", + ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); else - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += sysfs_emit_at(buf, ret, "%s%s", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } -- cgit From 63a7cb13071842966c1ce931edacbc23573aada5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 26 Jul 2022 20:54:10 +0200 Subject: btrfs: auto enable discard=async when possible There's a request to automatically enable async discard for capable devices. We can do that, the async mode is designed to wait for larger freed extents and is not intrusive, with limits to iops, kbps or latency. The status and tunables will be exported in /sys/fs/btrfs/FSID/discard . The automatic selection is done if there's at least one discard capable device in the filesystem (not capable devices are skipped). Mounting with any other discard option will honor that option, notably mounting with nodiscard will keep it disabled. Link: https://lore.kernel.org/linux-btrfs/CAEg-Je_b1YtdsCR0zS5XZ_SbvJgN70ezwvRwLiCZgDGLbeMB=w@mail.gmail.com/ Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 14 ++++++++++++++ fs/btrfs/super.c | 2 ++ fs/btrfs/volumes.c | 3 +++ fs/btrfs/volumes.h | 2 ++ 5 files changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5adf76d89f9b..3485f60efa33 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1396,6 +1396,7 @@ enum { BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), }; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5705335c80c8..cf59d8f14a05 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3746,6 +3746,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. 
+ */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); + btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } + /* * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a33dd9847d9..42e0d2fcc407 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -918,12 +918,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = -EINVAL; goto out; } + btrfs_clear_opt(info->mount_opt, NODISCARD); break; case Opt_nodiscard: btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); btrfs_clear_and_info(info, DISCARD_ASYNC, "turning off async discard"); + btrfs_set_opt(info->mount_opt, NODISCARD); break; case Opt_space_cache: case Opt_space_cache_version: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 635f45f1a2ef..796e9f5ff8f8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -641,6 +641,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!bdev_nonrot(bdev)) fs_devices->rotating = true; + if (bdev_max_discard_sectors(bdev)) + fs_devices->discardable = true; + device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 099def5613b8..a20ee7d57831 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -354,6 +354,8 @@ struct btrfs_fs_devices { * nonrot flag set */ bool rotating; + /* Devices support TRIM/discard commands */ + bool discardable; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ -- cgit From b307f06d37ca12495e4cfff8d456cd92f045f947 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Oct 2022 16:06:38 +0200 Subject: btrfs: simplify generation check in btrfs_get_dentry Callers that pass non-zero generation always want to perform the generation check, we can simply encode that in one parameter and drop check_generation. Add function documentation. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/export.c | 23 +++++++++++++++++------ fs/btrfs/export.h | 3 +-- fs/btrfs/ioctl.c | 2 +- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index fab7eb76e53b..a51a5dfa737c 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -57,9 +57,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } +/* + * Read dentry of inode with @objectid from filesystem root @root_objectid. + * + * @sb: the filesystem super block + * @objectid: inode objectid + * @root_objectid: object id of the subvolume root where to look up the inode + * @generation: optional, if not zero, verify that the found inode + * generation matches + * + * Return dentry alias for the inode, otherwise an error. In case the + * generation does not match return ESTALE. 
+ */ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation) + u64 root_objectid, u64 generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -77,7 +88,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (check_generation && generation != inode->i_generation) { + if (generation != 0 && generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } @@ -106,7 +117,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, objectid = fid->parent_objectid; generation = fid->parent_gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, @@ -128,7 +139,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, root_objectid = fid->root_objectid; generation = fid->gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } struct dentry *btrfs_get_parent(struct dentry *child) @@ -188,7 +199,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { return btrfs_get_dentry(fs_info->sb, key.objectid, - found_key.offset, 0, 0); + found_key.offset, 0); } return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 5afb7ca42828..eba6bc4f5a61 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,8 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation); + u64 root_objectid, u64 generation); struct dentry *btrfs_get_parent(struct dentry *child); #endif diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5ba2e810dc6e..a482739c1d82 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3274,7 +3274,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - vol_args2->subvolid, 0, 0); + vol_args2->subvolid, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_drop_write; -- cgit From 875c627c5f2045e2b5f0ad6692799e8fb931f3cb Mon Sep 17 00:00:00 2001 From: Wang Yugui Date: Wed, 19 Oct 2022 16:10:01 +0800 Subject: btrfs: send: add define for v2 buffer size Add a define, BTRFS_SEND_BUF_SIZE_V2, for the data buffer size (though the maximum size is not limited by it) so it's more visible.
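To make the arithmetic concrete, a compile-time sanity check; this is a sketch assuming the current BTRFS_MAX_COMPRESSED of SZ_128K and 4K pages, and the DEMO_* names mirror rather than reuse the real defines:

#include <linux/align.h>
#include <linux/build_bug.h>
#include <linux/sizes.h>

/* Assumed values: 128K maximum compressed extent, 4K page size */
#define DEMO_MAX_COMPRESSED	SZ_128K
#define DEMO_SEND_BUF_SIZE_V2	ALIGN(SZ_16K + DEMO_MAX_COMPRESSED, SZ_4K)

/* 16K of room for the command header plus one full compressed extent */
static_assert(DEMO_SEND_BUF_SIZE_V2 == 144 * SZ_1K);

Under those assumptions the v2 buffer is 144K, which is already a multiple of the page size, so the ALIGN() is a no-op today and only matters if the constants change.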
Signed-off-by: Wang Yugui Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- fs/btrfs/send.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1c4b693ee4a3..80104b5af32f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7901,7 +7901,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (sctx->proto >= 2) { u32 send_buf_num_pages; - sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; sctx->send_buf = vmalloc(sctx->send_max_size); if (!sctx->send_buf) { ret = -ENOMEM; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index f7585cfa7e52..4f5509cb1803 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -18,10 +18,12 @@ #endif /* - * In send stream v1, no command is larger than 64K. In send stream v2, no limit - * should be assumed. + * In send stream v1, no command is larger than 64K. In send stream v2, no + * limit should be assumed, the buffer size is set to be a header with + * compressed extent size. */ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K +#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) struct inode; struct btrfs_ioctl_send_args; -- cgit From c7f13d428ea1bfe883f2741a9b5a5352d595eb09 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:47 -0400 Subject: btrfs: move fs wide helpers out of ctree.h We have several fs wide related helpers in ctree.h. The bulk of these are the incompat flag test helpers, but there are things such as btrfs_fs_closing() and the read only helpers that also aren't directly related to the ctree code. Move these into a fs.h header, which will serve as the location for file system wide related helpers. 
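As a rough sketch of the kind of filesystem-wide helper this header is meant to host: only btrfs_fs_closing() and btrfs_need_cleaner_sleep() are real interfaces (visible in the diff below); the loop itself is invented for illustration:

#include <linux/sched.h>
#include "fs.h"

/* Hypothetical background worker built on the moved helpers */
static int demo_cleaner_loop(struct btrfs_fs_info *fs_info)
{
	/* Run until the filesystem starts tearing down */
	while (!btrfs_fs_closing(fs_info)) {
		if (btrfs_need_cleaner_sleep(fs_info)) {
			/* R/O remount or unmount in flight: just sleep */
			schedule_timeout_interruptible(HZ);
			continue;
		}
		/* ... one round of background cleaning work ... */
	}
	return 0;
}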
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.h | 164 --------------------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 92 +++++++++++++++++++++++++ fs/btrfs/fs.h | 85 +++++++++++++++++++++++ fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + fs/btrfs/zoned.c | 1 + 26 files changed, 200 insertions(+), 165 deletions(-) create mode 100644 fs/btrfs/fs.c create mode 100644 fs/btrfs/fs.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index fa9ddcc9eb0b..eebb45c06485 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 232f49415ab9..f76db317c437 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -15,6 +15,7 @@ #include "locking.h" #include "misc.h" #include "tree-mod-log.h" +#include "fs.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 47461fd21975..bf8d1e110136 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -17,6 +17,7 @@ #include "discard.h" #include "raid56.h" #include "zoned.h" +#include "fs.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3485f60efa33..1b2a1dcb60dd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2857,44 +2857,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) -{ - /* - * Do it this way so we only ever do one test_bit in the normal case. - */ - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) - return 2; - return 1; - } - return 0; -} - -/* - * If we remount the fs to be R/O or umount the fs, the cleaner needn't do - * anything except sleeping. This function is used to check the status of - * the fs. - * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, - * since setting and checking for SB_RDONLY in the superblock's flags is not - * atomic. 
- */ -static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || - btrfs_fs_closing(fs_info); -} - -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, @@ -3529,132 +3491,6 @@ do { \ } while (0) -/* compatibility and incompatibility defines */ - -#define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "setting incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "clearing incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_incompat(fs_info, opt) \ - __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) - -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -#define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "setting compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - 
disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "clearing compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_compat_ro(fs_info, opt) \ - __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cf59d8f14a05..5e20b191b108 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "fs.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2801c991814f..bc010dbcb6b1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,6 +36,7 @@ #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" +#include "fs.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6bb9fa961a6a..824ff54d8155 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "print-tree.h" #include "compression.h" +#include "fs.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 19b41b5fe6c0..a352c7cacc99 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,6 +30,7 @@ #include "delalloc-space.h" #include "reflink.h" #include "subpage.h" +#include "fs.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 367bcfcf68f5..bfc21eb8ec63 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -11,6 +11,7 @@ #include "free-space-tree.h" #include "transaction.h" #include "block-group.h" +#include "fs.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c new file mode 100644 index 000000000000..d4ba948eba56 --- /dev/null +++ b/fs/btrfs/fs.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "fs.h" + +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 
features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h new file mode 100644 index 000000000000..8eda9ce0a904 --- /dev/null +++ b/fs/btrfs/fs.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FS_H +#define BTRFS_FS_H + +/* Compatibility and incompatibility defines */ +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); + +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_incompat_flags(disk_super) & flag); +} + +static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_compat_ro_flags(disk_super) & flag); +} + +static inline int btrfs_fs_closing(struct 
btrfs_fs_info *fs_info) +{ + /* Do it this way so we only ever do one test_bit in the normal case. */ + if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { + if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return 2; + return 1; + } + return 0; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, + * since setting and checking for SB_RDONLY in the superblock's flags is not + * atomic. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || + btrfs_fs_closing(fs_info); +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50584b93a66f..60b3162c035d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -55,6 +55,7 @@ #include "zoned.h" #include "subpage.h" #include "inode-item.h" +#include "fs.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a482739c1d82..564a4ae9c285 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,6 +50,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "subpage.h" +#include "fs.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index e04289347775..d2c699c9aa51 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -11,6 +11,7 @@ #include "xattr.h" #include "compression.h" #include "space-info.h" +#include "fs.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b74105a10f16..e87a2f066f4d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -24,6 +24,7 @@ #include "block-group.h" #include "sysfs.h" #include "tree-mod-log.h" +#include "fs.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 216a4485d914..977afbb4cd45 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -28,6 +28,7 @@ #include "zoned.h" #include "inode-item.h" #include "space-info.h" +#include "fs.h" /* * Relocation overview diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cdda4b2f20f2..6871c1c54f8c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -21,6 +21,7 @@ #include "raid56.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" /* * This is only the first step towards a full-features scrub. 
It reads all diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index af2e133aaa5c..af1538c2685d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,7 @@ #include "transaction.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 42e0d2fcc407..ef646a1d9bc2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -49,6 +49,7 @@ #include "discard.h" #include "qgroup.h" #include "raid56.h" +#include "fs.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ae7d4aca771d..bae77fb05e2b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,6 +23,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "fs.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43f905ab0a18..862d67798de5 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -25,6 +25,7 @@ #include "volumes.h" #include "misc.h" #include "btrfs_inode.h" +#include "fs.h" /* * Error message should follow the following format: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c3cf3dabe0b1..e294c38f9b19 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "zoned.h" #include "inode-item.h" +#include "fs.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index b458452a1aaf..2d7eb290fb9c 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +#include "fs.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index ee00e33c309e..ab0b39badbbe 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "disk-io.h" #include "locking.h" +#include "fs.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 796e9f5ff8f8..d65d5d7835fe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -33,6 +33,7 @@ #include "block-group.h" #include "discard.h" #include "zoned.h" +#include "fs.h" static struct bio_set btrfs_bioset; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c9e2b0c85309..2a7d856c232c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "fs.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 -- cgit From e118578a8df7941a9bbc568851997852e5bc7338 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:48 -0400 Subject: btrfs: move assert helpers out of ctree.h These call functions that aren't defined in, or will be moved out of, ctree.h. Move them to super.c where the other assert/error message code is defined.
Drop the __noreturn attribute for btrfs_assertfail as objtool does not like it and fails with warnings like fs/btrfs/dir-item.o: warning: objtool: .text.unlikely: unexpected end of section fs/btrfs/xattr.o: warning: objtool: btrfs_setxattr() falls through to next function btrfs_setxattr_trans.cold() fs/btrfs/xattr.o: warning: objtool: .text.unlikely: unexpected end of section Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 18 +++--------------- fs/btrfs/super.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1b2a1dcb60dd..e5e4079c92d4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3333,18 +3333,11 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -__cold __noreturn -static inline void assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} +void __cold btrfs_assertfail(const char *expr, const char *file, int line); #define ASSERT(expr) \ - (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) - + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) #else -static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif @@ -3404,12 +3397,7 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -__cold -static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); __printf(5, 6) __cold diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef646a1d9bc2..06661bcb991b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -305,6 +305,20 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { -- cgit From 9b569ea0be6fb27a4985acc9325896a3edc95ede Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:49 -0400 Subject: btrfs: move the printk helpers out of ctree.h We have a bunch of printk helpers that are in ctree.h. These have nothing to do with ctree.c, so move them into their own header. Subsequent patches will clean up the printk helpers.
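For reference, a short sketch of how the relocated wrappers are consumed; demo_report() and its messages are invented, while btrfs_err() and btrfs_debug() are the real macros being moved:

#include "messages.h"

static void demo_report(struct btrfs_fs_info *fs_info, u64 bytenr, int err)
{
	if (err)
		/* Emits "BTRFS error (device ...): ..." at KERN_ERR */
		btrfs_err(fs_info, "demo: block %llu failed: %d", bytenr, err);
	else
		/* Compiled away unless DEBUG or dynamic debug is enabled */
		btrfs_debug(fs_info, "demo: block %llu ok", bytenr);
}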
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.h | 1 + fs/btrfs/check-integrity.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 249 ------------------------------------------ fs/btrfs/delalloc-space.c | 1 + fs/btrfs/delayed-inode.c | 1 + fs/btrfs/delayed-ref.c | 1 + fs/btrfs/dir-item.c | 1 + fs/btrfs/extent-io-tree.c | 1 + fs/btrfs/extent_map.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/lzo.c | 1 + fs/btrfs/messages.h | 259 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.c | 1 + fs/btrfs/print-tree.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/raid56.c | 1 + fs/btrfs/ref-verify.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/struct-funcs.c | 1 + fs/btrfs/subpage.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/sysfs.c | 2 +- fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-log.h | 1 + fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/ulist.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/xattr.c | 1 + fs/btrfs/zoned.h | 1 + 36 files changed, 293 insertions(+), 250 deletions(-) create mode 100644 fs/btrfs/messages.h (limited to 'fs') diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 6dac462430b0..fa3c9cbf9ae7 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -7,6 +7,7 @@ #define BTRFS_BACKREF_H #include +#include "messages.h" #include "ulist.h" #include "disk-io.h" #include "extent_io.h" diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 98c6e5feab19..e8e1a92b30ac 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -82,6 +82,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d645933ef135..f423b86f5262 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -8,6 +8,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5e4079c92d4..d27d059e4d59 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3168,179 +3168,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); -static inline __printf(2, 3) __cold -void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ -} - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) \ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) - -#define btrfs_printk(fs_info, fmt, args...) \ - _btrfs_printk(fs_info, fmt, ##args) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#else - -#define btrfs_printk(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, fmt, ##args) -#endif - -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) 
\ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use printk_in_rcu - */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk - */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) -#define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ - fs_info, KERN_DEBUG fmt, ##args) -#elif defined(DEBUG) -#define btrfs_debug(fs_info, fmt, args...) 
\ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) -#else -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#endif - -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - if (__ratelimit(&_rs)) \ - btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line); - -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) -#else -#define ASSERT(expr) (void)(expr) -#endif - #if BITS_PER_LONG == 32 #define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) /* @@ -3397,88 +3224,12 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); - -__printf(5, 6) -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -const char * __attribute_const__ btrfs_decode_error(int errno); - -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) - -#endif - #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 118b2e20b2e1..045545145a2b 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "delalloc-space.h" #include "block-rsv.h" diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a411f04a7b97..8cf5ee646147 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include #include +#include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 36a3debe9493..c775ff4f1cb1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -6,6 +6,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 72fb2c518a2b..be5c1c2a8da5 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. 
*/ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 599db7b15574..2cdff74ff033 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -2,6 +2,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 715979807ae1..f97508afb659 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "volumes.h" #include "extent_map.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 824ff54d8155..675987e2d652 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,6 +9,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bd68fafcccc7..83d866f5ab6c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "free-space-cache.h" diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index bfc21eb8ec63..026214d74a02 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index d4ba948eba56..a59504b59435 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "fs.h" diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 366f3a788c6a..b301d8e3df87 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "inode-item.h" #include "disk-io.h" diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 89bc5f825e0a..6751874a3e69 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -13,6 +13,7 @@ #include #include #include +#include "messages.h" #include "compression.h" #include "ctree.h" diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h new file mode 100644 index 000000000000..ace5bb02820a --- /dev/null +++ b/fs/btrfs/messages.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MESSAGES_H +#define BTRFS_MESSAGES_H + +#include + +struct btrfs_fs_info; +struct btrfs_trans_handle; + +static inline __printf(2, 3) __cold +void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} + +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#else + +#define btrfs_printk(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) +#endif + +#define btrfs_emerg(fs_info, fmt, args...) 
\ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) 
\ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) +#elif defined(DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#endif + +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_no_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line); + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) (void)(expr) +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); + +__printf(5, 6) +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +const char * __attribute_const__ btrfs_decode_error(int errno); + +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); + +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index de2b716d3e7b..1cbaacdc50da 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,6 +7,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "transaction.h" diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dd8777872143..708facaede2c 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index d2c699c9aa51..6d88bf0a9b17 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include +#include "messages.h" #include "props.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c009c0a2081e..349270537c2e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,6 +13,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index a248f46cfe72..f7535b8b62f5 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f50586ff85c8..6179864de6e7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,6 +2,7 @@ #include #include +#include "messages.h" #include "compression.h" #include "ctree.h" #include "delalloc-space.h" diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e1f599d7a916..44c8c8ad0a16 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 12455b2b41de..6ba16c018d7f 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -5,6 +5,7 @@ #include +#include "messages.h" #include "ctree.h" static bool check_setget_bounds(const struct extent_buffer *eb, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 9a176af847d7..dd46b978ac2c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "messages.h" #include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 06661bcb991b..7debbcfc5d47 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include "messages.h" 
#include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 910702df7767..0b6bea00742c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,7 +10,7 @@ #include #include #include - +#include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 862d67798de5..fa9536110d69 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -18,6 +18,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index aed1e05e9879..f5770829d075 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,6 +6,7 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include "messages.h" #include "ctree.h" #include "transaction.h" diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 18b4699bcb11..41e5d9f789dc 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 3374c9e9be67..f2f20c8d84aa 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,6 +5,7 @@ */ #include +#include "messages.h" #include "ulist.h" #include "ctree.h" diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 2d7eb290fb9c..190f752a2e10 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index ab0b39badbbe..35445855df4d 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -10,6 +10,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5bb8d8c86311..d12903f01f83 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -12,6 +12,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8bd16d40b7c6..f43990985d80 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" -- cgit From bbde07a40a13cc5a3483eaeb52e9bb3b61d2cf57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:50 -0400 Subject: btrfs: push printk index code into their respective helpers The printk index work can be pushed into the printk helpers themselves, this allows us to further sanitize messages.h, removing the last include in the header itself. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 29 +---------------------------- fs/btrfs/super.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index ace5bb02820a..3772358f8d30 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -13,19 +13,7 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) 
\ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) +#ifdef CONFIG_PRINTK #define btrfs_printk(fs_info, fmt, args...) \ _btrfs_printk(fs_info, fmt, ##args) @@ -223,25 +211,10 @@ do { \ __LINE__, (errno), first); \ } while (0) -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) -#endif - __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7debbcfc5d47..3aeb2106e119 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -181,6 +181,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char *errstr; #endif +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", + KERN_CRIT, fmt); +#endif + /* * Special case: if the error is EROFS, and we're already * under SB_RDONLY, then it is safe here. @@ -273,6 +279,10 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, const char *type = logtypes[4]; struct ratelimit_state *ratelimit = &printk_limits[4]; +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + va_start(args, fmt); while ((kern_level = printk_get_level(fmt)) != 0) { -- cgit From ec8eb376e271ed2b8724bf488f4c74d2746d5446 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:51 -0400 Subject: btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h We're going to use fs.h to hold fs wide related helpers and definitions, move the FS_STATE enum and related helpers to fs.h, and then update all files that need these definitions to include fs.h. 
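To illustrate the intended layering (a minimal sketch, not part of this patch; the function and its error code are hypothetical), a caller that only needs the fs-wide state test can now get it from fs.h rather than dragging in all of ctree.h:

#include "fs.h"

static int hypothetical_op(struct btrfs_fs_info *fs_info)
{
	/* BTRFS_FS_ERROR() wraps test_bit(BTRFS_FS_STATE_ERROR, ...) */
	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;
	/* ... do the actual work ... */
	return 0;
}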
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 46 ----------------------------------------- fs/btrfs/delalloc-space.c | 1 + fs/btrfs/delayed-inode.c | 3 ++- fs/btrfs/dev-replace.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/free-space-cache.c | 3 ++- fs/btrfs/fs.h | 49 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode-item.c | 3 ++- fs/btrfs/reflink.c | 3 ++- fs/btrfs/root-tree.c | 3 ++- fs/btrfs/sysfs.c | 1 + fs/btrfs/tests/btrfs-tests.c | 1 + fs/btrfs/xattr.c | 3 ++- 15 files changed, 68 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 65a4e5087215..c0615af0434f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -23,6 +23,7 @@ #include #include "misc.h" #include "ctree.h" +#include "fs.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f423b86f5262..7ecb658500ce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include "qgroup.h" #include "tree-mod-log.h" #include "tree-checker.h" +#include "fs.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d27d059e4d59..6f1eefbe3691 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -67,37 +67,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -/* - * Runtime (in-memory) states of filesystem - */ -enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, - /* - * Filesystem is being remounted, allow to skip some operations, like - * defrag - */ - BTRFS_FS_STATE_REMOUNTING, - /* Filesystem in RO mode */ - BTRFS_FS_STATE_RO, - /* Track if a transaction abort has been reported on this filesystem */ - BTRFS_FS_STATE_TRANS_ABORTED, - /* - * Bio operations should be blocked on this filesystem because a source - * or target device is being destroyed as part of a device replace - */ - BTRFS_FS_STATE_DEV_REPLACING, - /* The btrfs_fs_info created for self-tests */ - BTRFS_FS_STATE_DUMMY_FS_INFO, - - BTRFS_FS_STATE_NO_CSUMS, - - /* Indicates there was an error cleaning up a log tree. 
*/ - BTRFS_FS_STATE_LOG_CLEANUP_ERROR, - - BTRFS_FS_STATE_COUNT -}; - #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); @@ -3224,12 +3193,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) -#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ - (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ - &(fs_info)->fs_state))) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); @@ -3327,15 +3290,6 @@ static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); -} -#else -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return 0; -} #endif static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 045545145a2b..605d8874a446 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "qgroup.h" #include "block-group.h" +#include "fs.h" /* * HOW DOES THIS WORK diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8cf5ee646147..2f68570fbb53 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,12 +6,13 @@ #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" -#include "ctree.h" #include "qgroup.h" #include "locking.h" #include "inode-item.h" diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61e58066b5fd..348aef453e69 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -23,6 +23,7 @@ #include "sysfs.h" #include "zoned.h" #include "block-group.h" +#include "fs.h" /* * Device replace overview diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 78d7ea10621d..aa2d60f683bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -30,6 +30,7 @@ #include "zoned.h" #include "block-group.h" #include "compression.h" +#include "fs.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 83d866f5ab6c..703902156f97 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,9 +11,10 @@ #include #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "misc.h" -#include "ctree.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8eda9ce0a904..25487af14717 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,37 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +/* + * Runtime (in-memory) states of filesystem + */ +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Filesystem in RO mode */ + BTRFS_FS_STATE_RO, + /* Track if a transaction abort has been reported on this filesystem */ + 
BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT +}; + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -82,4 +113,22 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} +#else +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return 0; +} +#endif + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b301d8e3df87..25e9f1d65067 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -3,8 +3,9 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "inode-item.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 6179864de6e7..daf65bfad30e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,9 +2,10 @@ #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "compression.h" -#include "ctree.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 44c8c8ad0a16..112b4bf3c3b8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -5,8 +5,9 @@ #include #include -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 0b6bea00742c..31fb6cb389b1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,6 +22,7 @@ #include "block-group.h" #include "qgroup.h" #include "misc.h" +#include "fs.h" /* * Structure name Path diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d43cb5242fec..669fa7133a2f 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -16,6 +16,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../block-group.h" +#include "../fs.h" static struct vfsmount *test_mnt = NULL; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index d12903f01f83..b26c869f0226 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -12,8 +12,9 @@ #include #include #include -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" -- cgit From 0d3a9cf8c3068136145b90841778e3f1ac13f22a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:52 -0400 Subject: btrfs: convert incompat and compat flag test helpers to macros These helpers use functions not defined in fs.h, they're simply accessors of the super block in 
fs_info, convert them to macros so that we don't have a weird dependency between fs.h and accessors.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 25487af14717..682542ed964f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -44,6 +44,12 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, const char *name); +#define __btrfs_fs_incompat(fs_info, flags) \ + (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags))) + +#define __btrfs_fs_compat_ro(fs_info, flags) \ + (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags))) + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) @@ -62,20 +68,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ -- cgit From fc97a410bd781d00634194bfcddfd2be4185d954 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:53 -0400 Subject: btrfs: move mount option definitions to fs.h These are fs wide definitions and helpers, move them out of ctree.h and into fs.h. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 1 + fs/btrfs/ctree.h | 63 -------------------------------------------------- fs/btrfs/delayed-ref.c | 1 + fs/btrfs/discard.c | 1 + fs/btrfs/fs.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ref-verify.c | 1 + 6 files changed, 67 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 89e3e7d1bff6..a48fa7ac74cc 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -7,6 +7,7 @@ #include "transaction.h" #include "block-group.h" #include "disk-io.h" +#include "fs.h" /* * HOW DO BLOCK RESERVES WORK diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f1eefbe3691..86af9d36fdf7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1328,69 +1328,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Flags for mount options. 
- * - * Note: don't forget to add new options to btrfs_show_options() - */ -enum { - BTRFS_MOUNT_NODATASUM = (1UL << 0), - BTRFS_MOUNT_NODATACOW = (1UL << 1), - BTRFS_MOUNT_NOBARRIER = (1UL << 2), - BTRFS_MOUNT_SSD = (1UL << 3), - BTRFS_MOUNT_DEGRADED = (1UL << 4), - BTRFS_MOUNT_COMPRESS = (1UL << 5), - BTRFS_MOUNT_NOTREELOG = (1UL << 6), - BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), - BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), - BTRFS_MOUNT_NOSSD = (1UL << 9), - BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), - BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), - BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), - BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), - BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), - BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), - BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), - BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), - BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), - BTRFS_MOUNT_NODISCARD = (1UL << 31), -}; - -#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) -#define BTRFS_DEFAULT_MAX_INLINE (2048) - -#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) -#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) -#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ - BTRFS_MOUNT_##opt) - -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - /* * Requests for changes that need to be done during transaction commit. * diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c775ff4f1cb1..010cc16297d8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -13,6 +13,7 @@ #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" +#include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e1b7bd927d69..51f0ef386046 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "discard.h" #include "free-space-cache.h" +#include "fs.h" /* * This contains the logic to handle async discard. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 682542ed964f..4799812a7351 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -34,6 +34,49 @@ enum { BTRFS_FS_STATE_COUNT }; +/* + * Flags for mount options. 
+ * + * Note: don't forget to add new options to btrfs_show_options() + */ +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), +}; + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (2048) + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -68,6 +111,26 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ + BTRFS_MOUNT_##opt) + +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ +} while (0) + +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ +} while (0) + static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index f7535b8b62f5..9b09dc50ba14 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -11,6 +11,7 @@ #include "locking.h" #include "delayed-ref.h" #include "ref-verify.h" +#include "fs.h" /* * Used to keep track the roots and number of refs each root has for a given -- cgit From 7966a6b5959bbeb38b35d12b7a533c1dee8c432c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:54 -0400 Subject: btrfs: move fs_info::flags enum to fs.h These definitions are fs wide, take them out of ctree.h and put them in fs.h. 
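As context for how these bits are used, a rough sketch of the set/wait/wake pattern around BTRFS_FS_UNFINISHED_DROPS follows; the waiter fragment is hypothetical, while btrfs_wake_unfinished_drop() is the real helper this patch moves:

	/* Producer: record that half-completed snapshot drops remain. */
	set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);

	/* Hypothetical waiter: sleep until the bit is cleared. */
	wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS,
		    TASK_UNINTERRUPTIBLE);

	/* Cleanup side: clear the bit and wake all waiters. */
	btrfs_wake_unfinished_drop(fs_info);	/* clear_and_wake_up_bit() */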
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 68 ------------------------------------------- fs/btrfs/fs.h | 68 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tests/qgroup-tests.c | 1 + fs/btrfs/tree-mod-log.c | 1 + 4 files changed, 70 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 86af9d36fdf7..149c185f4314 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -280,69 +280,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -enum { - BTRFS_FS_CLOSING_START, - BTRFS_FS_CLOSING_DONE, - BTRFS_FS_LOG_RECOVERING, - BTRFS_FS_OPEN, - BTRFS_FS_QUOTA_ENABLED, - BTRFS_FS_UPDATE_UUID_TREE_GEN, - BTRFS_FS_CREATING_FREE_SPACE_TREE, - BTRFS_FS_BTREE_ERR, - BTRFS_FS_LOG1_ERR, - BTRFS_FS_LOG2_ERR, - BTRFS_FS_QUOTA_OVERRIDE, - /* Used to record internally whether fs has been frozen */ - BTRFS_FS_FROZEN, - /* - * Indicate that balance has been set up from the ioctl and is in the - * main phase. The fs_info::balance_ctl is initialized. - */ - BTRFS_FS_BALANCE_RUNNING, - - /* - * Indicate that relocation of a chunk has started, it's set per chunk - * and is toggled between chunks. - */ - BTRFS_FS_RELOC_RUNNING, - - /* Indicate that the cleaner thread is awake and doing something. */ - BTRFS_FS_CLEANER_RUNNING, - - /* - * The checksumming has an optimized version and is considered fast, - * so we don't need to offload checksums to workqueues. - */ - BTRFS_FS_CSUM_IMPL_FAST, - - /* Indicate that the discard workqueue can service discards. */ - BTRFS_FS_DISCARD_RUNNING, - - /* Indicate that we need to cleanup space cache v1 */ - BTRFS_FS_CLEANUP_SPACE_CACHE_V1, - - /* Indicate that we can't trust the free space tree for caching yet */ - BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, - - /* Indicate whether there are any tree modification log users */ - BTRFS_FS_TREE_MOD_LOG_USERS, - - /* Indicate that we want the transaction kthread to commit right now. */ - BTRFS_FS_COMMIT_TRANS, - - /* Indicate we have half completed snapshot deletions pending. */ - BTRFS_FS_UNFINISHED_DROPS, - - /* Indicate we have to finish a zone to do next allocation. */ - BTRFS_FS_NEED_ZONE_FINISH, - -#if BITS_PER_LONG == 32 - /* Indicate if we have error/warn message printed on 32bit systems */ - BTRFS_FS_32BIT_ERROR, - BTRFS_FS_32BIT_WARN, -#endif -}; - /* * Exclusive operations (device replace, resize, device add/remove, balance) */ @@ -1037,11 +974,6 @@ enum btrfs_lockdep_trans_states { &lock##_key, 0); \ } while (0) -static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) -{ - clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); -} - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 4799812a7351..7337707d3939 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -34,6 +34,69 @@ enum { BTRFS_FS_STATE_COUNT }; +enum { + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. 
The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, + + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + */ + BTRFS_FS_RELOC_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + + /* Indicate that we want the transaction kthread to commit right now. */ + BTRFS_FS_COMMIT_TRANS, + + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif +}; + /* * Flags for mount options. * @@ -168,6 +231,11 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 63676ea19f29..fd0e0ade8154 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -10,6 +10,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../backref.h" +#include "../fs.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 41e5d9f789dc..69083889a9b8 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -3,6 +3,7 @@ #include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" +#include "fs.h" struct tree_mod_root { u64 logical; -- cgit From c52cc7b7acfb3290a0268538b82ac7cf18df7ca4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:55 -0400 Subject: btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag Currently we are only using fs_info->pending_changes to indicate that we need a transaction commit. The original users for this were removed years ago and we don't have more usage in sight, so this is the only remaining reason to have this field. Add a flag so we can remove this code. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 3 +++ fs/btrfs/super.c | 3 ++- fs/btrfs/sysfs.c | 4 ++-- fs/btrfs/transaction.c | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7337707d3939..bc5de4d598ad 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -90,6 +90,9 @@ enum { /* Indicate we have to finish a zone to do next allocation. 
*/ BTRFS_FS_NEED_ZONE_FINISH, + /* Indicate that we want to commit the transaction. */ + BTRFS_FS_NEED_TRANS_COMMIT, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3aeb2106e119..0839e4a1cfac 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1535,7 +1535,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * Exit unless we have some pending changes * that need to go through commit */ - if (fs_info->pending_changes == 0) + if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, + &fs_info->flags)) return 0; /* * A non-blocking test if the fs is frozen. We must not diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 31fb6cb389b1..47b221372dcf 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -249,7 +249,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return count; @@ -960,7 +960,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bae77fb05e2b..7b6b68ab089a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2104,6 +2104,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); + /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; -- cgit From 55e5cfd36da5d71e21b72a5922c9b6c349744c2a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:56 -0400 Subject: btrfs: remove fs_info::pending_changes and related code Now that we're not using this code anywhere we can remove it as well as the member from fs_info. We don't have any mount options or on/off features that would utilize the pending infrastructure, the last one was inode_cache. There was a patchset [1] to enable some features from sysfs that would break things if it would be set immediately. In case we'll need that kind of logic again the patch can be reverted, but for the current use it can be replaced by the single state bit to do the commit. [1] https://lore.kernel.org/linux-btrfs/1422609654-19519-1-git-send-email-quwenruo@cn.fujitsu.com/ Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 23 +---------------------- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/transaction.c | 25 ------------------------- fs/btrfs/transaction.h | 1 - 4 files changed, 1 insertion(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 149c185f4314..c5f2f387989e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -373,11 +373,7 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; unsigned long mount_opt; - /* - * Track requests for actions that need to be done during transaction - * commit (like for some mount options). 
- */ - unsigned long pending_changes; + unsigned long compress_type:4; unsigned int compress_level; u32 commit_interval; @@ -1260,23 +1256,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Requests for changes that need to be done during transaction commit. - * - * Internal mount options that are used for special handling of the real - * mount options (eg. cannot be set during remount and have to be set during - * transaction commit) - */ - -#define BTRFS_PENDING_COMMIT (0) - -#define btrfs_test_pending(info, opt) \ - test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_set_pending(info, opt) \ - set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_clear_pending(info, opt) \ - clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) - struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5e20b191b108..2c38b4eebd8f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3761,12 +3761,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_clear_opt(fs_info->mount_opt, NODISCARD); } - /* - * Mount does not set all options immediately, we can do it now and do - * not have to wait for transaction commit - */ - btrfs_apply_pending_changes(fs_info); - #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(fs_info, fs_devices, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7b6b68ab089a..37d0baaa41d8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2359,12 +2359,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - /* - * Since the transaction is done, we can apply the pending changes - * before the next transaction. - */ - btrfs_apply_pending_changes(fs_info); - /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -2587,25 +2581,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 
0 : 1; } -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) -{ - unsigned long prev; - unsigned long bit; - - prev = xchg(&fs_info->pending_changes, 0); - if (!prev) - return; - - bit = 1 << BTRFS_PENDING_COMMIT; - if (prev & bit) - btrfs_debug(fs_info, "pending commit done"); - prev &= ~bit; - - if (prev) - btrfs_warn(fs_info, - "unknown pending changes left 0x%lx, ignoring", prev); -} - int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b39ebf98af60..97f6c39f59c8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -233,7 +233,6 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); -- cgit From d83eb482b727e7d9fdf6f729db6da3642c4d8a58 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:57 -0400 Subject: btrfs: move the compat/incompat flag masks to fs.h This is fs wide information, move it out of ctree.h into fs.h. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 57 -------------------------------------------------------- fs/btrfs/fs.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5f2f387989e..af6b72c7e33f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -78,63 +78,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); */ #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) -/* - * Compat flags that we support. 
If any incompat flags are set other than the - * ones specified below then we will fail to mount - */ -#define BTRFS_FEATURE_COMPAT_SUPP 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL - -#define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY | \ - BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) - -#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL - -#ifdef CONFIG_BTRFS_DEBUG -/* - * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG - */ -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) -#else -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) -#endif - -#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ - (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) -#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bc5de4d598ad..af356feec0c7 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -140,6 +140,63 @@ enum { BTRFS_MOUNT_NODISCARD = (1UL << 31), }; +/* + * Compat flags that we support. 
If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) + +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) +#endif + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) -- cgit From 818fe33aed42ddd5052171328a3f708e98357e10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:58 -0400 Subject: btrfs: rename struct-funcs.c to accessors.c Rename struct-funcs.c to accessors.c so we can move the item accessors out of ctree.h. accessors.c is a better description of the code that is contained in these files. 
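For orientation while reading the move, each DEFINE_BTRFS_SETGET_BITS(n) instantiation in this file expands to four generic helpers; their signatures, reconstructed from the macro template below for n = 32, are:

u32 btrfs_get_32(const struct extent_buffer *eb, const void *ptr,
		 unsigned long off);
void btrfs_set_32(const struct extent_buffer *eb, void *ptr,
		  unsigned long off, u32 val);
u32 btrfs_get_token_32(struct btrfs_map_token *token, const void *ptr,
		       unsigned long off);
void btrfs_set_token_32(struct btrfs_map_token *token, const void *ptr,
			unsigned long off, u32 val);

The token variants cache the address of the most recently touched extent buffer page so that clustered accesses skip the page lookup.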
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/accessors.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/struct-funcs.c | 167 ------------------------------------------------ 3 files changed, 167 insertions(+), 168 deletions(-) create mode 100644 fs/btrfs/accessors.c delete mode 100644 fs/btrfs/struct-funcs.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index eebb45c06485..76f90dcfb14d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c new file mode 100644 index 000000000000..118bfd1c0e3e --- /dev/null +++ b/fs/btrfs/accessors.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include "messages.h" +#include "ctree.h" + +static bool check_setget_bounds(const struct extent_buffer *eb, + const void *ptr, unsigned off, int size) +{ + const unsigned long member_offset = (unsigned long)ptr + off; + + if (unlikely(member_offset + size > eb->len)) { + btrfs_warn(eb->fs_info, + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), + (unsigned long)ptr, eb->start, member_offset, size); + return false; + } + + return true; +} + +/* + * Macro templates that define helpers to read/write extent buffer data of a + * given size, that are also used via ctree.h for access to item members by + * specialized helpers. + * + * Generic helpers: + * - btrfs_set_8 (for 8/16/32/64) + * - btrfs_get_8 (for 8/16/32/64) + * + * Generic helpers with a token (cached address of the most recently accessed + * page): + * - btrfs_set_token_8 (for 8/16/32/64) + * - btrfs_get_token_8 (for 8/16/32/64) + * + * The set/get functions handle data spanning two pages transparently, in case + * metadata block size is larger than page. Every pointer to metadata items is + * an offset into the extent buffer page array, cast to a specific type. This + * gives us all the type checking. + * + * The extent buffer pages stored in the array pages do not form a contiguous + * physical range, but the API functions assume the linear offset to the range + * from 0 to metadata node size.
+ */ + +#define DEFINE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ + \ + ASSERT(token); \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + return get_unaligned_le##bits(token->kaddr + oip); \ + } \ + token->kaddr = page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ + return get_unaligned_le##bits(token->kaddr + oip); \ + \ + memcpy(lebytes, token->kaddr + oip, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(lebytes + part, token->kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ +} \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ + \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ + return get_unaligned_le##bits(kaddr + oip); \ + \ + memcpy(lebytes, kaddr + oip, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(lebytes + part, kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ +} \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ + \ + ASSERT(token); \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ + return; \ + } \ + token->kaddr = page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ + return; \ + } \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(token->kaddr + oip, lebytes, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(token->kaddr, lebytes + part, size - part); \ +} \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = 
get_eb_page_index(member_offset); \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ + \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, kaddr + oip); \ + return; \ + } \ + \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(kaddr + oip, lebytes, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(kaddr, lebytes + part, size - part); \ +} + +DEFINE_BTRFS_SETGET_BITS(8) +DEFINE_BTRFS_SETGET_BITS(16) +DEFINE_BTRFS_SETGET_BITS(32) +DEFINE_BTRFS_SETGET_BITS(64) + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c deleted file mode 100644 index 6ba16c018d7f..000000000000 --- a/fs/btrfs/struct-funcs.c +++ /dev/null @@ -1,167 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. - */ - -#include - -#include "messages.h" -#include "ctree.h" - -static bool check_setget_bounds(const struct extent_buffer *eb, - const void *ptr, unsigned off, int size) -{ - const unsigned long member_offset = (unsigned long)ptr + off; - - if (unlikely(member_offset + size > eb->len)) { - btrfs_warn(eb->fs_info, - "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", - (member_offset > eb->len ? "start" : "end"), - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - - return true; -} - -/* - * Macro templates that define helpers to read/write extent buffer data of a - * given size, that are also used via ctree.h for access to item members by - * specialized helpers. - * - * Generic helpers: - * - btrfs_set_8 (for 8/16/32/64) - * - btrfs_get_8 (for 8/16/32/64) - * - * Generic helpers with a token (cached address of the most recently accessed - * page): - * - btrfs_set_token_8 (for 8/16/32/64) - * - btrfs_get_token_8 (for 8/16/32/64) - * - * The set/get functions handle data spanning two pages transparently, in case - * metadata block size is larger than page. Every pointer to metadata items is - * an offset into the extent buffer page array, cast to a specific type. This - * gives us all the type checking. - * - * The extent buffer pages stored in the array pages do not form a contiguous - * phyusical range, but the API functions assume the linear offset to the range - * from 0 to metadata node size. 
- */ - -#define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ - const int size = sizeof(u##bits); \ - u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ - \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - return get_unaligned_le##bits(token->kaddr + oip); \ - } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ - return get_unaligned_le##bits(token->kaddr + oip); \ - \ - memcpy(lebytes, token->kaddr + oip, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ - memcpy(lebytes + part, token->kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ -u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ - const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ - u8 lebytes[sizeof(u##bits)]; \ - \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ - return get_unaligned_le##bits(kaddr + oip); \ - \ - memcpy(lebytes, kaddr + oip, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ - memcpy(lebytes + part, kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ - const int size = sizeof(u##bits); \ - u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ - \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ - return; \ - } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ - return; \ - } \ - put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oip, lebytes, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ - memcpy(token->kaddr, lebytes + part, size - part); \ -} \ -void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ - unsigned long off, u##bits val) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = 
get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ - const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ - u8 lebytes[sizeof(u##bits)]; \ - \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, kaddr + oip); \ - return; \ - } \ - \ - put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oip, lebytes, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ - memcpy(kaddr, lebytes + part, size - part); \ -} - -DEFINE_BTRFS_SETGET_BITS(8) -DEFINE_BTRFS_SETGET_BITS(16) -DEFINE_BTRFS_SETGET_BITS(32) -DEFINE_BTRFS_SETGET_BITS(64) - -void btrfs_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr = btrfs_node_key_ptr_offset(nr); - read_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} -- cgit From ad1ac5012c2beccc9de4d3e2cc1382fd250e5540 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:59 -0400 Subject: btrfs: move btrfs_map_token to accessors This is specific to the item-accessor code, move it out of ctree.h into accessor.h/.c and then update the users to include the new header file. This un-inlines btrfs_init_map_token, however this is only called once per function so it's not critical to be inlined. This also saves 904 bytes of code on a release build. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 8 ++++++++ fs/btrfs/accessors.h | 14 ++++++++++++++ fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 16 ++-------------- fs/btrfs/inode.c | 1 + fs/btrfs/tree-log.c | 1 + 6 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 fs/btrfs/accessors.h (limited to 'fs') diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 118bfd1c0e3e..7a7b7d263102 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -6,6 +6,7 @@ #include #include "messages.h" #include "ctree.h" +#include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) @@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb, return true; } +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +{ + token->eb = eb; + token->kaddr = page_address(eb->pages[0]); + token->offset = 0; +} + /* * Macro templates that define helpers to read/write extent buffer data of a * given size, that are also used via ctree.h for access to item members by diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h new file mode 100644 index 000000000000..a47b0f9d30ca --- /dev/null +++ b/fs/btrfs/accessors.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACCESSORS_H +#define BTRFS_ACCESSORS_H + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); + +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7ecb658500ce..1283ca46c5bc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,6 +19,7 @@ #include "tree-mod-log.h" #include "tree-checker.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af6b72c7e33f..296ca4de8d52 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -52,6 +52,8 @@ 
struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; +struct btrfs_map_token; + #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1199,23 +1201,9 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline void btrfs_init_map_token(struct btrfs_map_token *token, - struct extent_buffer *eb) -{ - token->eb = eb; - token->kaddr = page_address(eb->pages[0]); - token->offset = 0; -} - /* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 60b3162c035d..cb2a6d7f6252 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -56,6 +56,7 @@ #include "subpage.h" #include "inode-item.h" #include "fs.h" +#include "accessors.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e294c38f9b19..dc49f0aae1fb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -22,6 +22,7 @@ #include "zoned.h" #include "inode-item.h" #include "fs.h" +#include "accessors.h" #define MAX_CONFLICT_INODES 10 -- cgit From 07e81dc94474eb62705c6f96d9ab1a5a797b8703 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:51:00 -0400 Subject: btrfs: move accessor helpers into accessors.h This is a large patch, but because the helpers are all macros it is impossible to split it up meaningfully. Simply copy all of the item accessors from ctree.h, paste them into accessors.h, and then update the files that use them to include the new header so everything compiles.
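For context, the pattern being moved is a family of macros that stamp out little-endian get/set helpers, one pair per on-disk struct member. Below is a minimal userspace sketch of the same BTRFS_SETGET_STACK_FUNCS-style pattern; all demo_* names and the struct layout are illustrative stand-ins, not kernel code, and the real kernel helpers additionally handle members that live in (or span) extent buffer pages:

#include <stdint.h>
#include <stdio.h>

/* Byte-wise stand-ins for the kernel's get/put_unaligned_le64(). */
static inline uint64_t demo_get_unaligned_le64(const void *p)
{
	const uint8_t *b = p;
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v |= (uint64_t)b[i] << (8 * i);
	return v;
}

static inline void demo_put_unaligned_le64(uint64_t val, void *p)
{
	uint8_t *b = p;
	int i;

	for (i = 0; i < 8; i++)
		b[i] = (uint8_t)(val >> (8 * i));
}

/*
 * An on-disk style item with little-endian members, loosely modeled on
 * btrfs_dev_item; byte arrays make the LE storage explicit and unaligned.
 */
struct demo_dev_item {
	uint8_t type[8];
	uint8_t total_bytes[8];
};

/*
 * Same shape as BTRFS_SETGET_STACK_FUNCS: one macro invocation generates
 * a typed getter/setter pair for one struct member.
 */
#define DEMO_SETGET_STACK_FUNCS(name, type, member)			\
static inline uint64_t demo_##name(const type *s)			\
{									\
	return demo_get_unaligned_le64(&s->member);			\
}									\
static inline void demo_set_##name(type *s, uint64_t val)		\
{									\
	demo_put_unaligned_le64(val, &s->member);			\
}

DEMO_SETGET_STACK_FUNCS(device_total_bytes, struct demo_dev_item, total_bytes)

int main(void)
{
	struct demo_dev_item item = {{0}};

	demo_set_device_total_bytes(&item, 0x123456789abcULL);
	printf("total_bytes = 0x%llx\n",
	       (unsigned long long)demo_device_total_bytes(&item));
	return 0;
}

Generating the helpers this way keeps every accessor type-checked at its call site while the endianness conversion lives in exactly one place, which is why the whole block has to move between headers as a unit.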
Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments, style fixups ] Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 1068 +++++++++++++++++++++++++++++++ fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/block-rsv.c | 1 + fs/btrfs/check-integrity.c | 1 + fs/btrfs/ctree.h | 1090 -------------------------------- fs/btrfs/delayed-inode.c | 1 + fs/btrfs/dev-replace.c | 1 + fs/btrfs/dir-item.c | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/export.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/locking.c | 1 + fs/btrfs/print-tree.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/ref-verify.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/sysfs.c | 1 + fs/btrfs/tests/extent-buffer-tests.c | 1 + fs/btrfs/tests/free-space-tree-tests.c | 1 + fs/btrfs/tests/inode-tests.c | 1 + fs/btrfs/tests/qgroup-tests.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-defrag.c | 1 + fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/uuid-tree.c | 2 +- fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + fs/btrfs/xattr.c | 1 + fs/btrfs/zoned.c | 1 + 46 files changed, 1112 insertions(+), 1091 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index a47b0f9d30ca..e8bbd4d80161 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -9,6 +9,1074 @@ struct btrfs_map_token { unsigned long offset; }; +#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) + void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); +/* + * Some macros to generate set/get functions for the struct fields. 
This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple one + * for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +#define read_eb_member(eb, ptr, type, member, result) (\ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) (\ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off); \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val); \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ +} \ +static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ +{ \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ + return get_unaligned_le##bits(&p->member); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + put_unaligned_le##bits(val, &p->member); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const type *s) \ +{ \ + return get_unaligned_le##bits(&s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + put_unaligned_le##bits(val, &s->member); \ +} + +static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item 
*)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); 
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) +{ + unsigned long offset = (unsigned long)c; + + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + return 0; +} + +/* struct 
btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); + +static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ +} + +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); + +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); +} + +static inline void 
btrfs_item_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* struct btrfs_root_ref */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions. 
+ */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + const struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + const struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +#endif + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); + +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags | flag); +} + +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, 
flags & ~flag); +} + +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) +{ + u64 flags = btrfs_header_flags(eb); + + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline int btrfs_is_leaf(const struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); + +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + 
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static inline void btrfs_cpu_balance_args_to_disk( + struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = 
cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); + +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. 
+ */ +static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* + * Returns the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. 
+ */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +/* Cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f76db317c437..41caa6fc6301 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,6 +16,7 @@ #include "misc.h" #include "tree-mod-log.h" #include "fs.h" +#include "accessors.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bf8d1e110136..d1f8d792b184 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -18,6 +18,7 @@ #include "raid56.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a48fa7ac74cc..069ea9d6a8d4 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -8,6 +8,7 @@ #include "block-group.h" #include "disk-io.h" #include "fs.h" +#include "accessors.h" /* * HOW DO BLOCK RESERVES WORK diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e8e1a92b30ac..7ff0703ef3e4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -93,6 +93,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "compression.h" +#include "accessors.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 296ca4de8d52..20034c23abb2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1176,8 +1176,6 @@ static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) return info->nodesize - sizeof(struct btrfs_header); } -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1204,1094 +1202,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -/* some macros to generate set/get functions for the struct fields. 
This - * assumes there is a lefoo_to_cpu for every type, so lets make a simple - * one for u8: - */ -#define le8_to_cpu(v) (v) -#define cpu_to_le8(v) (v) -#define __le8 u8 - -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - -#define read_eb_member(eb, ptr, type, member, result) (\ - read_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define write_eb_member(eb, ptr, type, member, result) (\ - write_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ -u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off); \ -void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ - unsigned long off, u##bits val); - -DECLARE_BTRFS_SETGET_BITS(8) -DECLARE_BTRFS_SETGET_BITS(16) -DECLARE_BTRFS_SETGET_BITS(32) -DECLARE_BTRFS_SETGET_BITS(64) - -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - return btrfs_get_##bits(eb, s, offsetof(type, member)); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ - u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ -} - -#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ -{ \ - const type *p = page_address(eb->pages[0]) + \ - offset_in_page(eb->start); \ - return get_unaligned_le##bits(&p->member); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, \ - u##bits val) \ -{ \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ - put_unaligned_le##bits(val, &p->member); \ -} - -#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const type *s) \ -{ \ - return get_unaligned_le##bits(&s->member); \ -} \ -static inline void btrfs_set_##name(type *s, u##bits val) \ -{ \ - put_unaligned_le##bits(val, &s->member); \ -} - -static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); -} -static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s, - u64 val) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item 
*)0))->total_bytes); - WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); - btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); -} - - -BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); -BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); -BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); -BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, - start_offset, 64); -BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); -BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); -BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); -BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); -BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, - dev_group, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, - seek_speed, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, - bandwidth, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, - generation, 64); - -static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); -} - -static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); -} - -BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); -BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); -BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); -BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); -BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); -BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); -BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); - -static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) -{ - return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); -} - -BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, - stripe_len, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, - io_align, 32); 
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, - num_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, - sub_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); - -static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, - int nr) -{ - unsigned long offset = (unsigned long)c; - offset += offsetof(struct btrfs_chunk, stripe); - offset += nr * sizeof(struct btrfs_stripe); - return (struct btrfs_stripe *)offset; -} - -static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); -} - -/* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); - -BTRFS_SETGET_FUNCS(block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(block_group_flags, - struct btrfs_block_group_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, - struct btrfs_block_group_item, flags, 64); - -/* struct btrfs_free_space_info */ -BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, - extent_count, 32); -BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); - -/* struct btrfs_inode_ref */ -BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); -BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); - -/* struct btrfs_inode_extref */ -BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, - parent_objectid, 64); -BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, - name_len, 16); -BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); - -/* struct btrfs_inode_item */ -BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); -BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); -BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); -BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); -BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); -BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); 
-BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); -BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - -/* struct btrfs_dev_extent */ -BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, - chunk_tree, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, - chunk_objectid, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, - chunk_offset, 64); -BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); -BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); - -BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); - -static inline void btrfs_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, - root, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, - objectid, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, - offset, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, - type, 8); -BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, - offset, 64); - -static inline u32 btrfs_extent_inline_ref_size(int type) -{ - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) - return sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_SHARED_DATA_REF_KEY) - return sizeof(struct btrfs_shared_data_ref) + - sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_EXTENT_DATA_REF_KEY) - return sizeof(struct btrfs_extent_data_ref) + - offsetof(struct btrfs_extent_inline_ref, offset); - return 0; -} - -/* 
struct btrfs_node */ -BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); -BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, - blockptr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, - generation, 64); - -static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline unsigned long btrfs_node_key_ptr_offset(int nr) -{ - return offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; -} - -void btrfs_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr); - -static inline void btrfs_set_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); - write_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} - -/* struct btrfs_item */ -BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); - -static inline unsigned long btrfs_item_nr_offset(int nr) -{ - return offsetof(struct btrfs_leaf, items) + - sizeof(struct btrfs_item) * nr; -} - -static inline struct btrfs_item *btrfs_item_nr(int nr) -{ - return (struct btrfs_item *)btrfs_item_nr_offset(nr); -} - -#define BTRFS_ITEM_SETGET_FUNCS(member) \ -static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ - int slot) \ -{ \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ -} \ -static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ - int slot, u32 val) \ -{ \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ -} - -BTRFS_ITEM_SETGET_FUNCS(offset) -BTRFS_ITEM_SETGET_FUNCS(size); - -static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); -} - -static inline void 
btrfs_item_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - read_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - write_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); - -/* - * struct btrfs_root_ref - */ -BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); -BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); -BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); - -/* struct btrfs_dir_item */ -BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); -BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, - data_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, - name_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, - transid, 64); - -static inline void btrfs_dir_item_key(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, - num_entries, 64); -BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, - num_bitmaps, 64); -BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, - generation, 64); - -static inline void btrfs_free_space_key(const struct extent_buffer *eb, - const struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -static inline void btrfs_set_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -/* struct btrfs_disk_key */ -BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); - -#ifdef __LITTLE_ENDIAN - -/* - * Optimized helpers for little-endian architectures where CPU and on-disk - * structures have the same endianness and we can skip conversions. 
- */ - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, - const struct btrfs_disk_key *disk_key) -{ - memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, - const struct btrfs_key *cpu_key) -{ - memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_node_key(eb, disk_key, nr); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_item_key(eb, disk_key, nr); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *cpu_key) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_dir_item_key(eb, item, disk_key); -} - -#else - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, - const struct btrfs_disk_key *disk) -{ - cpu->offset = le64_to_cpu(disk->offset); - cpu->type = disk->type; - cpu->objectid = le64_to_cpu(disk->objectid); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, - const struct btrfs_key *cpu) -{ - disk->offset = cpu_to_le64(cpu->offset); - disk->type = cpu->type; - disk->objectid = cpu_to_le64(cpu->objectid); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_node_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_item_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *key) -{ - struct btrfs_disk_key disk_key; - btrfs_dir_item_key(eb, item, &disk_key); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -#endif - -/* struct btrfs_header */ -BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); -BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); -BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); -BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); -BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, - nritems, 32); -BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); - -static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) -{ - return (btrfs_header_flags(eb) & flag) == flag; -} - -static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags | flag); -} - -static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags & 
~flag); -} - -static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) -{ - u64 flags = btrfs_header_flags(eb); - return flags >> BTRFS_BACKREF_REV_SHIFT; -} - -static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, - int rev) -{ - u64 flags = btrfs_header_flags(eb); - flags &= ~BTRFS_BACKREF_REV_MASK; - flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; - btrfs_set_header_flags(eb, flags); -} - -static inline int btrfs_is_leaf(const struct extent_buffer *eb) -{ - return btrfs_header_level(eb) == 0; -} - -/* struct btrfs_root_item */ -BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); - -BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); -BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); -BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); -BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); -BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, - last_snapshot, 64); -BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, - generation_v2, 64); -BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, - ctransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, - otransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, - stransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, - rtransid, 64); - -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - -/* struct btrfs_root_backup */ -BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, - tree_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, - tree_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, - tree_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, - chunk_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, - chunk_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, - extent_root_level, 8); - 
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, - fs_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, - fs_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, - fs_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, - dev_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, - dev_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, - dev_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, - csum_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, - csum_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, - csum_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, - num_devices, 64); - -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void -btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = 
cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - -/* struct btrfs_super_block */ -BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); -BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, - struct btrfs_super_block, sys_chunk_array_size, 32); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, - struct btrfs_super_block, chunk_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, - root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, - chunk_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, - log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, - log_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, - sectorsize, 32); -BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, - nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, - stripesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, - root_dir_objectid, 64); -BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, - num_devices, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, - compat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, - compat_ro_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, - incompat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, - csum_type, 16); -BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, - cache_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); -BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, - uuid_tree_generation, 64); - -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - - -/* - * The leaf data grows from end-to-front in the node. 
- * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - -/* struct btrfs_file_extent_item */ -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, - type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, - struct btrfs_file_extent_item, disk_bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, - struct btrfs_file_extent_item, offset, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, - struct btrfs_file_extent_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, - struct btrfs_file_extent_item, num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, - struct btrfs_file_extent_item, ram_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, - struct btrfs_file_extent_item, disk_num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, - struct btrfs_file_extent_item, compression, 8); - -static inline unsigned long -btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} - -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); -BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, - disk_bytenr, 64); -BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, - disk_num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, - offset, 64); -BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, - num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, - ram_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, - compression, 8); -BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, - encryption, 8); -BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, - other_encoding, 16); - -/* - * this returns the number of bytes used by the item on disk, minus the - * size of any extent headers. 
If a file is compressed on disk, this is - * the compressed size - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -/* btrfs_qgroup_status_item */ -BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, - version, 64); -BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, - rescan, 64); - -/* btrfs_qgroup_info_item */ -BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, - rfer_cmpr, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, - excl_cmpr, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, - struct btrfs_qgroup_info_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, - rfer, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, - struct btrfs_qgroup_info_item, rfer_cmpr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, - excl, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, - struct btrfs_qgroup_info_item, excl_cmpr, 64); - -/* btrfs_qgroup_limit_item */ -BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, - max_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, - max_excl, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, - rsv_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, - rsv_excl, 64); - -/* btrfs_dev_replace_item */ -BTRFS_SETGET_FUNCS(dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, - 64); -BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, - replace_state, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, - time_started, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, - time_stopped, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, - num_write_errors, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, num_uncorrectable_read_errors, - 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, - cursor_left, 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, - cursor_right, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, - cont_reading_from_srcdev_mode, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, - struct btrfs_dev_replace_item, replace_state, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, - struct btrfs_dev_replace_item, 
time_started, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, - struct btrfs_dev_replace_item, time_stopped, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, - struct btrfs_dev_replace_item, num_write_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, - num_uncorrectable_read_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, - struct btrfs_dev_replace_item, cursor_left, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, - struct btrfs_dev_replace_item, cursor_right, 64); - -/* btrfs_verity_descriptor_item */ -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct btrfs_verity_descriptor_item, encryption, 8); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, - struct btrfs_verity_descriptor_item, size, 64); - -/* helper function to cast into the data area of the leaf. */ -#define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - -#define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { return crc32c(crc, address, length); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 2f68570fbb53..012c96de4701 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -17,6 +17,7 @@ #include "locking.h" #include "inode-item.h" #include "space-info.h" +#include "accessors.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 348aef453e69..94f8975034ce 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -24,6 +24,7 @@ #include "zoned.h" #include "block-group.h" #include "fs.h" +#include "accessors.h" /* * Device replace overview diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index be5c1c2a8da5..9fa37f245c43 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -7,6 +7,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "accessors.h" /* * insert a name into a directory, doing overflow properly if there is a hash diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c38b4eebd8f..75475a213972 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,7 @@ #include "zoned.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index a51a5dfa737c..b6bc9684648f 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -7,6 +7,7 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" +#include "accessors.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bc010dbcb6b1..8d28ba360e9a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -37,6 +37,7 @@ #include "zoned.h" #include "dev-replace.h" #include "fs.h" +#include "accessors.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aa2d60f683bb..4b47bb8c590f 100644 --- a/fs/btrfs/extent_io.c 
+++ b/fs/btrfs/extent_io.c @@ -31,6 +31,7 @@ #include "block-group.h" #include "compression.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 675987e2d652..bce6c8d31bc0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -18,6 +18,7 @@ #include "print-tree.h" #include "compression.h" #include "fs.h" +#include "accessors.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a352c7cacc99..62360a554420 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -31,6 +31,7 @@ #include "reflink.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 703902156f97..26ff1a5100b9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -26,6 +26,7 @@ #include "discard.h" #include "subpage.h" #include "inode-item.h" +#include "accessors.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 026214d74a02..fc79d21e7b4f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -13,6 +13,7 @@ #include "transaction.h" #include "block-group.h" #include "fs.h" +#include "accessors.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index a59504b59435..5553e1f8afe8 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -3,6 +3,7 @@ #include "messages.h" #include "ctree.h" #include "fs.h" +#include "accessors.h" void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 25e9f1d65067..b8dbabfa8b31 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -11,6 +11,7 @@ #include "transaction.h" #include "print-tree.h" #include "space-info.h" +#include "accessors.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 564a4ae9c285..e33be4032fff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "block-group.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0eab3cb274a1..870528d87526 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -12,6 +12,7 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" +#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. 
For a given diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 708facaede2c..aab7d30eed55 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,7 @@ #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "accessors.h" struct root_name_map { u64 id; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 6d88bf0a9b17..9ad15d69718c 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -13,6 +13,7 @@ #include "compression.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e87a2f066f4d..39f67ca4b4ca 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -25,6 +25,7 @@ #include "sysfs.h" #include "tree-mod-log.h" #include "fs.h" +#include "accessors.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9b09dc50ba14..95d28497de7c 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -12,6 +12,7 @@ #include "delayed-ref.h" #include "ref-verify.h" #include "fs.h" +#include "accessors.h" /* * Used to keep track the roots and number of refs each root has for a given diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index daf65bfad30e..f0243eb33df7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -11,6 +11,7 @@ #include "reflink.h" #include "transaction.h" #include "subpage.h" +#include "accessors.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 977afbb4cd45..3d0a63465a74 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -29,6 +29,7 @@ #include "inode-item.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" /* * Relocation overview diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 112b4bf3c3b8..fc00dfb281d5 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -13,6 +13,7 @@ #include "print-tree.h" #include "qgroup.h" #include "space-info.h" +#include "accessors.h" /* * Read a root item from the tree. In case we detect a root item smaller then diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6871c1c54f8c..5c6c4aa2fb09 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -22,6 +22,7 @@ #include "block-group.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" /* * This is only the first step towards a full-features scrub. 
It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 80104b5af32f..31fb3292e816 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -27,6 +27,7 @@ #include "compression.h" #include "xattr.h" #include "print-tree.h" +#include "accessors.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index af1538c2685d..94d73485454a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0839e4a1cfac..c02fcdb9c7c4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,6 +51,7 @@ #include "qgroup.h" #include "raid56.h" #include "fs.h" +#include "accessors.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 47b221372dcf..a14812f1254a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -23,6 +23,7 @@ #include "qgroup.h" #include "misc.h" #include "fs.h" +#include "accessors.h" /* * Structure name Path diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index b7d181a08eab..5ef0b90e25c3 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../extent_io.h" #include "../disk-io.h" +#include "../accessors.h" static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 13734ed43bfc..53a17b1d1744 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -10,6 +10,7 @@ #include "../free-space-tree.h" #include "../transaction.h" #include "../block-group.h" +#include "../accessors.h" struct free_space_extent { u64 start; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 625f7d398368..0a34a54ea9fd 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -11,6 +11,7 @@ #include "../extent_io.h" #include "../volumes.h" #include "../compression.h" +#include "../accessors.h" static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index fd0e0ade8154..65b65d55d1f6 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -11,6 +11,7 @@ #include "../qgroup.h" #include "../backref.h" #include "../fs.h" +#include "../accessors.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 37d0baaa41d8..25e6b504edb4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -24,6 +24,7 @@ #include "space-info.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index fa9536110d69..11cafc520b47 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -27,6 +27,7 @@ #include "misc.h" #include "btrfs_inode.h" #include "fs.h" +#include "accessors.h" /* * Error message should follow the following format: diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b6cf39f4e7e4..5df604846de6 100644 --- a/fs/btrfs/tree-defrag.c +++ 
b/fs/btrfs/tree-defrag.c @@ -9,6 +9,7 @@ #include "print-tree.h" #include "transaction.h" #include "locking.h" +#include "accessors.h" /* * Defrag all the leaves in a given btree. diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 69083889a9b8..3bea19698a10 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -4,6 +4,7 @@ #include "tree-mod-log.h" #include "disk-io.h" #include "fs.h" +#include "accessors.h" struct tree_mod_root { u64 logical; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 190f752a2e10..70304b89f31f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -11,7 +11,7 @@ #include "disk-io.h" #include "print-tree.h" #include "fs.h" - +#include "accessors.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 35445855df4d..d02fa354fc2b 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -17,6 +17,7 @@ #include "disk-io.h" #include "locking.h" #include "fs.h" +#include "accessors.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d65d5d7835fe..767c56be6b96 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,7 @@ #include "discard.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" static struct bio_set btrfs_bioset; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b26c869f0226..fcf2d5f7e198 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -21,6 +21,7 @@ #include "disk-io.h" #include "props.h" #include "locking.h" +#include "accessors.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 2a7d856c232c..fe66cb929af9 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -16,6 +16,7 @@ #include "dev-replace.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 -- cgit From e9c83077d2be815cb04f0b054e9494512b1657ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:51:01 -0400 Subject: btrfs: remove temporary btrfs_map_token declaration in ctree.h This was added while I was moving this code to its new home; it can be removed now. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 20034c23abb2..3dcfb62af68a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -52,8 +52,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -struct btrfs_map_token; - #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 -- cgit From d68194b238228ce470de1fc2453851fc06294739 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:45:37 +0200 Subject: btrfs: sink gfp_t parameter to btrfs_backref_iter_alloc There's only one caller that passes GFP_NOFS, so we can drop the parameter and use the flags directly.
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/backref.c | 5 ++--- fs/btrfs/backref.h | 3 +-- fs/btrfs/relocation.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 41caa6fc6301..f556566b9c79 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2650,12 +2650,11 @@ void free_ipath(struct inode_fs_paths *ipath) kfree(ipath); } -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag) +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; - ret = kzalloc(sizeof(*ret), gfp_flag); + ret = kzalloc(sizeof(*ret), GFP_NOFS); if (!ret) return NULL; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index fa3c9cbf9ae7..2dd68f87dca5 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -156,8 +156,7 @@ struct btrfs_backref_iter { u32 end_ptr; }; -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag); +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3d0a63465a74..77d03dce2521 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -473,7 +473,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( int ret; int err = 0; - iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); + iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); -- cgit
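The same refactoring pattern drives the next several commits: once every caller of an allocation helper passes the same gfp_t value, the parameter carries no information and can be sunk into the helper. A minimal sketch of the before and after shapes, using hypothetical names rather than the actual btrfs functions (kzalloc() and GFP_NOFS come from <linux/slab.h> and <linux/gfp.h>):

	/* Before: every caller must thread the allocation flags through. */
	struct foo *foo_alloc(gfp_t gfp_flag)
	{
		return kzalloc(sizeof(struct foo), gfp_flag);	/* struct foo is a hypothetical type */
	}

	/* After: the single caller's GFP_NOFS is folded into the helper. */
	struct foo *foo_alloc(void)
	{
		return kzalloc(sizeof(struct foo), GFP_NOFS);
	}

Call sites shrink from foo_alloc(GFP_NOFS) to foo_alloc(), which is exactly the shape of the btrfs_backref_iter_alloc() change above.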
From e2896e791001b778e3bf8466dd8c6104245c08a2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:55:09 +0200 Subject: btrfs: sink gfp_t parameter to btrfs_qgroup_trace_extent All callers pass GFP_NOFS, so we can drop the parameter and use it directly. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 17 +++++++---------- fs/btrfs/qgroup.h | 2 +- fs/btrfs/tree-log.c | 3 +-- 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 39f67ca4b4ca..4786b59035a8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1842,7 +1842,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, } int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag) + u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; @@ -1852,7 +1852,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kzalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; @@ -1904,8 +1904,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, - GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret; } @@ -2104,12 +2103,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, * blocks for qgroup accounting. */ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + nodesize); if (ret < 0) goto out; - ret = btrfs_qgroup_trace_extent(trans, - dst_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, + nodesize); if (ret < 0) goto out; @@ -2393,8 +2391,7 @@ walk_down: path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, - fs_info->nodesize, - GFP_NOFS); + fs_info->nodesize); if (ret) goto out; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 3fb5459c9309..7bffa10589d6 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -321,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag); + u64 num_bytes); /* * Inform qgroup to trace all leaf items of data diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dc49f0aae1fb..8f377aca3409 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -749,8 +749,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item), - GFP_NOFS); + btrfs_file_extent_disk_num_bytes(eb, item)); if (ret < 0) goto out; -- cgit From fe10158c759cc978d0def69b5cb5efabecb42653 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:59:35 +0200 Subject: btrfs: switch GFP_NOFS to GFP_KERNEL in scrub_setup_recheck_block There's only one caller of scrub_setup_recheck_block, and it runs inside memalloc_nofs_save/_restore protection, so the allocation is effectively GFP_NOFS already and it's safe to use GFP_KERNEL. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5c6c4aa2fb09..6e70cc348ae2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1493,7 +1493,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, return -EIO; } - recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); if (!recover) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1516,7 +1516,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = alloc_scrub_sector(sblock, logical, GFP_NOFS); + sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; -- cgit
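The GFP_KERNEL switch above relies on scoped allocation constraints: inside a memalloc_nofs_save()/memalloc_nofs_restore() section the page allocator implicitly drops __GFP_FS from every allocation, so a GFP_KERNEL request behaves as GFP_NOFS anyway. A condensed sketch of that calling pattern, with do_recheck() as a hypothetical stand-in for the protected scrub path:

	#include <linux/sched/mm.h>

	unsigned int nofs_flag;
	int ret;

	/* All allocations in this section are implicitly GFP_NOFS. */
	nofs_flag = memalloc_nofs_save();
	ret = do_recheck();		/* hypothetical; may use GFP_KERNEL internally */
	memalloc_nofs_restore(nofs_flag);

Annotating the allocation site with GFP_KERNEL then records the function's real requirement, while the caller's scope still enforces NOFS behavior where it matters.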
From 02bc392798f9bf8105d50b2fec44b5b2e3c80d47 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 16:00:55 +0200 Subject: btrfs: sink gfp_t parameter to alloc_scrub_sector All callers pass GFP_KERNEL as the parameter, so we can use it directly in alloc_scrub_sector. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6e70cc348ae2..3aed760e1a80 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -297,7 +297,7 @@ static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, * Will also allocate new pages for @sblock if needed. */ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, - u64 logical, gfp_t gfp) + u64 logical) { const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; struct scrub_sector *ssector; @@ -305,7 +305,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, /* We must never have scrub_block exceed U32_MAX in size. */ ASSERT(logical - sblock->logical < U32_MAX); - ssector = kzalloc(sizeof(*ssector), gfp); + ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); if (!ssector) return NULL; @@ -313,7 +313,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, if (!sblock->pages[page_index]) { int ret; - sblock->pages[page_index] = alloc_page(gfp); + sblock->pages[page_index] = alloc_page(GFP_KERNEL); if (!sblock->pages[page_index]) { kfree(ssector); return NULL; @@ -1516,7 +1516,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2438,7 +2438,7 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2769,7 +2769,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, for (index = 0; len > 0; index++) { struct scrub_sector *sector; - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; -- cgit From 82c0efd3cd5d4ecce028da75d29e3345535b3389 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 19 Oct 2022 21:51:42 +0530 Subject: btrfs: merge module cleanup sequence to one helper The module exit function exit_btrfs_fs() is duplicating a section of code in init_btrfs_fs(). Add a helper to remove the duplicated code. Due to the init/exit section requirements, the function must be inline and not a plain static, as that could cause a section mismatch. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c02fcdb9c7c4..75b1c096e4f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2837,7 +2837,7 @@ static const struct init_sequence mod_init_seq[] = { static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; -static void __exit exit_btrfs_fs(void) +static __always_inline void btrfs_exit_btrfs_fs(void) { int i; @@ -2850,6 +2850,11 @@ static void __exit exit_btrfs_fs(void) } } +static void __exit exit_btrfs_fs(void) +{ + btrfs_exit_btrfs_fs(); +} + static int __init init_btrfs_fs(void) { int ret; @@ -2858,26 +2863,13 @@ static int __init init_btrfs_fs(void) for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { ASSERT(!mod_init_result[i]); ret = mod_init_seq[i].init_func(); - if (ret < 0) - goto error; + if (ret < 0) { + btrfs_exit_btrfs_fs(); + return ret; + } mod_init_result[i] = true; } return 0; - -error: - /* - * If we call exit_btrfs_fs() it would cause section mismatch. - * As init_btrfs_fs() belongs to .init.text, while exit_btrfs_fs() - * belongs to .exit.text. - */ - for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { - if (!mod_init_result[i]) - continue; - if (mod_init_seq[i].exit_func) - mod_init_seq[i].exit_func(); - mod_init_result[i] = false; - } - return ret; } late_initcall(init_btrfs_fs); -- cgit
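The consolidation above centers on a simple rollback pattern: module load walks a table of init/exit pairs forward, and teardown (whether from a failed load or from module exit) walks it backward, skipping stages that never initialized. A condensed sketch of the shared teardown walk, reconstructed from the names visible in the diff (the exact field layout of struct init_sequence is assumed):

	struct init_sequence {
		int (*init_func)(void);		/* assumed signature */
		void (*exit_func)(void);	/* NULL when a stage needs no teardown */
	};

	/* Undo in reverse order, only for stages that actually came up. */
	for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) {
		if (!mod_init_result[i])
			continue;
		if (mod_init_seq[i].exit_func)
			mod_init_seq[i].exit_func();
		mod_init_result[i] = false;
	}

Making this walk an __always_inline helper lets the __init error path and the __exit function share one copy without triggering modpost section-mismatch warnings.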
- */ - for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { - if (!mod_init_result[i]) - continue; - if (mod_init_seq[i].exit_func) - mod_init_seq[i].exit_func(); - mod_init_result[i] = false; - } - return ret; } late_initcall(init_btrfs_fs); -- cgit From e43eec81c5167b655b72c781b0e75e62a05e415e Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:25 -0400 Subject: btrfs: use struct qstr instead of name and namelen pairs Many functions throughout btrfs take a name buffer and a name length as separate arguments. At the highest level, most of these functions are called with these arguments extracted from a supplied dentry's name. But the entire name can be passed instead, making each function a little more elegant. Each function whose arguments are currently the name and length extracted from a dentry is herein converted to instead take a pointer to the name in the dentry. The couple of calls to these functions without a struct dentry are converted to create an appropriate qstr to pass in. Additionally, every function that is only called with a name/len pair extracted directly from a qstr is also converted. This change has a positive effect on stack consumption, as the frame of many functions is reduced, and it will also be used in the future for fscrypt-related structures. Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba
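To make the new calling convention concrete, here is a minimal sketch of the before/after signatures; the lookup_dir_*() helpers are hypothetical stand-ins for the converted btrfs functions, and QSTR_INIT comes from <linux/dcache.h>:

#include <linux/types.h>
#include <linux/dcache.h>

/* Before: callers unpack the name into a buffer/length pair. */
int lookup_dir_old(u64 dir, const char *name, int name_len);

/* After: callers pass one qstr pointer. */
int lookup_dir_new(u64 dir, const struct qstr *name);

static int example(struct dentry *dentry)
{
	/* A dentry already carries its name as a qstr... */
	int ret = lookup_dir_new(1234, &dentry->d_name);

	if (!ret) {
		/* ...and callers without a dentry build one on the stack. */
		struct qstr fixed = QSTR_INIT("default", 7);

		ret = lookup_dir_new(1234, &fixed);
	}
	return ret;
}

The on-stack QSTR_INIT("default", 7) pattern is exactly what the conversion below uses in may_destroy_subvol(), btrfs_ioctl_default_subvol() and get_default_subvol_objectid().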
--- fs/btrfs/ctree.h | 26 +++-- fs/btrfs/dir-item.c | 50 +++++---- fs/btrfs/inode-item.c | 73 +++++++------- fs/btrfs/inode-item.h | 20 ++-- fs/btrfs/inode.c | 130 ++++++++++-------------- fs/btrfs/ioctl.c | 7 +- fs/btrfs/root-tree.c | 19 ++-- fs/btrfs/send.c | 12 ++- fs/btrfs/super.c | 3 +- fs/btrfs/transaction.c | 11 +- fs/btrfs/tree-log.c | 267 ++++++++++++++++++++++--------------------- fs/btrfs/tree-log.h | 4 +- 12 files changed, 287 insertions(+), 335 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3dcfb62af68a..9e8c9f9bc4fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1516,11 +1516,11 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len); + u64 ref_id, u64 dirid, u64 sequence, + const struct qstr *name); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len); + u64 ref_id, u64 dirid, u64 *sequence, + const struct qstr *name); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1549,25 +1549,23 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, + const struct qstr *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct qstr *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const char *name, int name_len, - int mod); + const struct qstr *name, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod); + u64 index, const struct qstr *name, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, - const char *name, int name_len); + const struct qstr *name); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1648,10 +1646,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len); + const struct qstr *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index); + const struct qstr *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 9fa37f245c43..48a15af4ec57 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -105,8 +105,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct qstr *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -122,7 +122,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -130,9 +130,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_cpu_key_to_disk(&disk_key, location); - data_size = sizeof(*dir_item) + name_len; + data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); + name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) @@ -144,11 +144,11 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_type(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); - btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); - write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, name->name, name_ptr, name->len); btrfs_mark_buffer_dirty(leaf); second_insert: @@ -159,7 +159,7 @@ second_insert: } btrfs_release_path(path); - ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir, + ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); @@ -208,7 +208,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, -
const char *name, int name_len, + const struct qstr *name, int mod) { struct btrfs_key key; @@ -216,9 +216,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; @@ -226,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len) + const struct qstr *name) { int ret; struct btrfs_key key; @@ -242,9 +243,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, + name->len, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ @@ -264,11 +266,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, goto out; } - /* - * see if there is room in the item to insert this - * name - */ - data_size = sizeof(*di) + name_len; + /* See if there is room in the item to insert this name. */ + data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + @@ -305,8 +304,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod) + u64 index, const struct qstr *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -315,7 +313,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; @@ -323,9 +322,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, } struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len) +btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dirid, const struct qstr *name) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -340,7 +338,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, break; di = btrfs_match_dir_item_name(root->fs_info, path, - name, name_len); + name->name, name->len); if (di) return di; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b8dbabfa8b31..577e39cfc411 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -14,8 +14,8 @@ #include "accessors.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len) + int slot, + const struct qstr *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -31,9 +31,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, len = btrfs_inode_ref_name_len(leaf, ref); name_ptr = (unsigned long)(ref + 1); 
cur_offset += len + sizeof(*ref); - if (len != name_len) + if (len != name->len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + if (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0) return ref; } return NULL; @@ -41,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len) + const struct qstr *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -64,9 +65,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( name_ptr = (unsigned long)(&extref->name); ref_name_len = btrfs_inode_extref_name_len(leaf, extref); - if (ref_name_len == name_len && + if (ref_name_len == name->len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) + (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0)) return extref; cur_offset += ref_name_len + sizeof(*extref); @@ -79,7 +81,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -88,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) @@ -96,13 +98,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, if (ret > 0) return NULL; return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -111,14 +113,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_inode_extref *extref; struct extent_buffer *leaf; int ret; - int del_len = name_len + sizeof(*extref); + int del_len = name->len + sizeof(*extref); unsigned long ptr; unsigned long item_start; u32 item_size; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -136,7 +138,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * readonly. 
*/ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -172,8 +174,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -186,7 +187,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 sub_item_len; int ret; int search_ext_refs = 0; - int del_len = name_len + sizeof(*ref); + int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -205,8 +206,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } - ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, - name_len); + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); if (!ref) { ret = -ENOENT; search_ext_refs = 1; @@ -223,7 +223,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } ptr = (unsigned long)ref; - sub_item_len = name_len + sizeof(*ref); + sub_item_len = name->len + sizeof(*ref); item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); @@ -237,7 +237,7 @@ out: * name in our ref array. Find and remove the extended * inode ref then. */ - return btrfs_del_inode_extref(trans, root, name, name_len, + return btrfs_del_inode_extref(trans, root, name, inode_objectid, ref_objectid, index); } @@ -251,12 +251,13 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index) + const struct qstr *name, + u64 inode_objectid, u64 ref_objectid, + u64 index) { struct btrfs_inode_extref *extref; int ret; - int ins_len = name_len + sizeof(*extref); + int ins_len = name->len + sizeof(*extref); unsigned long ptr; struct btrfs_path *path; struct btrfs_key key; @@ -264,7 +265,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -276,7 +277,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len)) + name)) goto out; btrfs_extend_item(path, ins_len); @@ -290,12 +291,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; - btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len); btrfs_set_inode_extref_index(path->nodes[0], extref, index); btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); ptr = (unsigned long)&extref->name; - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -305,8 +306,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct 
btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -315,7 +315,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; unsigned long ptr; int ret; - int ins_len = name_len + sizeof(*ref); + int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -331,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len); + name); if (ref) goto out; @@ -340,7 +340,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); ret = 0; @@ -348,7 +348,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len)) + name)) ret = -EEXIST; else ret = -EMLINK; @@ -357,11 +357,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); } - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -374,7 +374,6 @@ out: if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) ret = btrfs_insert_inode_extref(trans, root, name, - name_len, inode_objectid, ref_objectid, index); } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index a8fc16d0147f..3c657c670cfd 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); + struct btrfs_root *root, const struct qstr *name, + u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int 
name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); + int slot, + const struct qstr *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); + const struct qstr *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb2a6d7f6252..3162d8f0aaf8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3641,7 +3641,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delayed_iput_lock); } -/** +/* * Wait for flushing all delayed iputs * * @fs_info: the filesystem @@ -4286,7 +4286,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len, + const struct qstr *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4304,8 +4304,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; @@ -4333,12 +4332,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } } - ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, - dir_ino, &index); + ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", - name_len, name, ino, dir_ino); + name->len, name->name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } @@ -4359,10 +4357,8 @@ skip_backref: * operations on the log tree, increasing latency for applications. 
*/ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); + btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, dir, index); } /* @@ -4380,7 +4376,7 @@ err: if (ret) goto out; - btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); @@ -4393,10 +4389,11 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const struct qstr *name) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); + + ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4440,9 +4437,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &dentry->d_name); if (ret) goto out; @@ -4467,8 +4463,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const char *name = dentry->d_name.name; - int name_len = dentry->d_name.len; + const struct qstr *name = &dentry->d_name; u64 index; int ret; u64 objectid; @@ -4487,8 +4482,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4514,8 +4508,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. 
*/ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, - name, name_len); + di = btrfs_search_dir_index_item(root, path, dir_ino, name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4532,7 +4525,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name, name_len); + &index, name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4545,7 +4538,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name->len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4567,6 +4560,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; + struct qstr name = QSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4577,7 +4571,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, - dir_id, "default", 7, 0); + dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { @@ -4844,9 +4838,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &dentry->d_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -5538,8 +5531,7 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const struct qstr *name = &dentry->d_name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -5550,7 +5542,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); + name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -5562,7 +5554,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name, btrfs_ino(BTRFS_I(dir)), + __func__, name->name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) @@ -6321,8 +6313,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const char *name = args->orphan ? NULL : args->dentry->d_name.name; - int name_len = args->orphan ? 0 : args->dentry->d_name.len; + const struct qstr *name = args->orphan ? 
NULL : &args->dentry->d_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6423,7 +6414,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, sizes[1] = 2 + sizeof(*ref); } else { key[1].offset = btrfs_ino(BTRFS_I(dir)); - sizes[1] = name_len + sizeof(*ref); + sizes[1] = name->len + sizeof(*ref); } } @@ -6462,10 +6453,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_ref_index(path->nodes[0], ref, 0); write_extent_buffer(path->nodes[0], "..", ptr, 2); } else { - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, + name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->dir_index); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, + name->len); } } @@ -6526,7 +6519,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 0, BTRFS_I(inode)->dir_index); + 0, BTRFS_I(inode)->dir_index); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -6555,7 +6548,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index) + const struct qstr *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6574,17 +6567,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - index, name, name_len); + index, name); } else if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, - parent_ino, index); + ret = btrfs_insert_inode_ref(trans, root, name, + ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; - ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6594,7 +6587,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + - name_len * 2); + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime @@ -6619,15 +6612,15 @@ fail_dir_item: int err; err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - &local_index, name, name_len); + &local_index, name); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; - err = btrfs_del_inode_ref(trans, root, name, name_len, - ino, parent_ino, &local_index); + err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, + &local_index); if (err) btrfs_abort_transaction(trans, err); } @@ -6747,7 +6740,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, dentry->d_name.len, 1, index); + &dentry->d_name, 1, index); if (err) { drop_inode = 1; @@ -9088,9 +9081,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); @@ -9104,9 +9095,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, - old_dentry->d_name.name, - old_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, root, &old_dentry->d_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); @@ -9142,8 +9131,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, + &old_dentry->d_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); @@ -9159,8 +9147,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, + &new_dentry->d_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); @@ -9171,16 +9158,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, old_idx); + &new_dentry->d_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, 0, new_idx); + &old_dentry->d_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9281,8 +9266,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* check for collisions, even if the name isn't there */ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_dentry->d_name); if (ret) { if (ret == -EEXIST) { @@ -9376,9 +9360,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) @@ -9402,10 +9384,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), - BTRFS_I(d_inode(old_dentry)), - old_dentry->d_name.name, - old_dentry->d_name.len, - &rename_ctx); + BTRFS_I(d_inode(old_dentry)), + &old_dentry->d_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9424,8 +9404,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_dentry->d_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9437,8 +9416,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, index); + &new_dentry->d_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e33be4032fff..fde55d0f61b4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -951,6 +951,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; + struct qstr name_str = QSTR_INIT(name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -971,8 +972,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, * check for them now when we can safely fail */ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, name, - namelen); + dir->i_ino, &name_str); if (error) goto out_dput; @@ -3779,6 +3779,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; + struct qstr name = QSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; @@ -3822,7 +3823,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, - dir_id, "default", 7, 1); + dir_id, &name, 1); if (IS_ERR_OR_NULL(di)) { btrfs_release_path(path); btrfs_end_transaction(trans); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index fc00dfb281d5..848a720747af 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -330,9 +330,8 @@ out: } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len) - + u64 ref_id, u64 dirid, u64 *sequence, + const struct qstr *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -359,8 +358,8 @@ again: struct btrfs_root_ref); ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || - (btrfs_root_ref_name_len(leaf, ref) != name_len) || - memcmp_extent_buffer(leaf, name, ptr, name_len)) { + (btrfs_root_ref_name_len(leaf, ref) != name->len) || + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { ret = -ENOENT; goto out; } @@ 
-403,8 +402,8 @@ out: * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len) + u64 ref_id, u64 dirid, u64 sequence, + const struct qstr *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; @@ -423,7 +422,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, - sizeof(*ref) + name_len); + sizeof(*ref) + name->len); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); @@ -434,9 +433,9 @@ again: ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); btrfs_set_root_ref_dirid(leaf, ref, dirid); btrfs_set_root_ref_sequence(leaf, ref, sequence); - btrfs_set_root_ref_name_len(leaf, ref, name_len); + btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); - write_extent_buffer(leaf, name, ptr, name_len); + write_extent_buffer(leaf, name->name, ptr, name->len); btrfs_mark_buffer_dirty(leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 31fb3292e816..d0feeea2ba1c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1597,13 +1597,17 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { + struct qstr tmp_name; + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); ASSERT(len < sizeof(tmp)); + tmp_name.name = tmp; + tmp_name.len = strlen(tmp); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1623,7 +1627,7 @@ static int gen_unique_name(struct send_ctx *sctx, di = btrfs_lookup_dir_item(NULL, sctx->parent_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1753,13 +1757,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; + struct qstr name_str = QSTR_INIT(name, name_len); path = alloc_path_for_send(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, - dir, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 75b1c096e4f1..13d33e5707de 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1423,6 +1423,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; + struct qstr name = QSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); @@ -1435,7 +1436,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * to mount. 
*/ dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { btrfs_free_path(path); return PTR_ERR(di); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 25e6b504edb4..7db9612f4d0e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1678,8 +1678,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - dentry->d_name.name, - dentry->d_name.len, 0); + &dentry->d_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1774,7 +1773,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - dentry->d_name.name, dentry->d_name.len); + &dentry->d_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1806,9 +1805,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, dentry->d_name.name, - dentry->d_name.len, BTRFS_I(parent_inode), - &key, BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &dentry->d_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f377aca3409..b14ac7017f41 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -597,6 +597,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans, return do_overwrite_item(trans, root, path, eb, slot, key); } +static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, + struct qstr *name) +{ + char *buf; + + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(eb, buf, (unsigned long)start, len); + name->name = buf; + name->len = len; + return 0; +} + /* * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log @@ -902,12 +917,11 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, - int name_len) + const struct qstr *name) { int ret; - ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) return ret; /* @@ -934,8 +948,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - char *name; - int name_len; + struct qstr name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -943,12 +956,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - name_len = btrfs_dir_name_len(leaf, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) + ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); + if (ret) return -ENOMEM; - read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(path); inode = read_one_inode(root, location.objectid); @@ -961,10 +972,9 @@ static 
noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, - name_len); + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); out: - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -979,14 +989,14 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - const char *name, int name_len) + struct qstr *name) { struct btrfs_dir_item *di; struct btrfs_key location; int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, - index, name, name_len, 0); + index, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -999,7 +1009,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, } btrfs_release_path(path); - di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -1026,7 +1036,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const char *name, int namelen) + const struct qstr *name) { struct btrfs_path *path; int ret; @@ -1046,12 +1056,10 @@ static noinline int backref_in_log(struct btrfs_root *log, if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, namelen); + ref_objectid, name); else ret = !!btrfs_find_name_in_backref(path->nodes[0], - path->slots[0], - name, namelen); + path->slots[0], name); out: btrfs_free_path(path); return ret; @@ -1064,11 +1072,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, char *name, int namelen) + u64 ref_index, struct qstr *name) { int ret; - char *victim_name; - int victim_name_len; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; @@ -1100,43 +1106,40 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - victim_ref = (struct btrfs_inode_ref *)ptr; - victim_name_len = btrfs_inode_ref_name_len(leaf, - victim_ref); - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; + struct qstr victim_name; - read_extent_buffer(leaf, victim_name, - (unsigned long)(victim_ref + 1), - victim_name_len); + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, dir, inode, - victim_name, victim_name_len); - kfree(victim_name); + &victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); - ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; } } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + extref = btrfs_lookup_inode_extref(NULL, root, path, 
name, inode_objectid, parent_objectid, 0, 0); if (IS_ERR(extref)) { @@ -1153,29 +1156,28 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)(base + cur_offset); + struct qstr victim_name; - victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + extref = (struct btrfs_inode_extref *)(base + cur_offset); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; - read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, - victim_name_len); + ret = read_alloc_one_name(leaf, &extref->name, + btrfs_inode_extref_name_len(leaf, extref), + &victim_name); + if (ret) + return ret; search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = btrfs_extref_hash(parent_objectid, - victim_name, - victim_name_len); + victim_name.name, + victim_name.len); ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { ret = -ENOENT; @@ -1187,26 +1189,24 @@ again: ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), - inode, - victim_name, - victim_name_len); + inode, &victim_name); } iput(victim_parent); - kfree(victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); next: - cur_offset += victim_name_len + sizeof(*extref); + cur_offset += victim_name.len + sizeof(*extref); } } btrfs_release_path(path); /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - ref_index, name, namelen, 0); + ref_index, name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1217,8 +1217,7 @@ next: btrfs_release_path(path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), - name, namelen, 0); + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1232,20 +1231,18 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, + struct qstr *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; + int ret; extref = (struct btrfs_inode_extref *)ref_ptr; - *namelen = btrfs_inode_extref_name_len(eb, extref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)&extref->name, - *namelen); + ret = read_alloc_one_name(eb, &extref->name, + btrfs_inode_extref_name_len(eb, extref), name); + if (ret) + return ret; if (index) *index = btrfs_inode_extref_index(eb, extref); @@ -1256,18 +1253,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) + struct qstr *name, u64 *index) { struct btrfs_inode_ref *ref; + int ret; ref = (struct btrfs_inode_ref *)ref_ptr; - *namelen = btrfs_inode_ref_name_len(eb, ref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), + name); + if (ret) + return ret; 
if (index) *index = btrfs_inode_ref_index(eb, ref); @@ -1309,28 +1305,24 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - char *name = NULL; - int namelen; + struct qstr name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); } else { parent_id = key->offset; - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - NULL); + ret = ref_get_fields(eb, ref_ptr, &name, NULL); } if (ret) goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen); + parent_id, &name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, - name, namelen); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { struct inode *dir; @@ -1339,20 +1331,20 @@ again: dir = read_one_inode(root, parent_id); if (!dir) { ret = -ENOENT; - kfree(name); + kfree(name.name); goto out; } ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, name, namelen); - kfree(name); + inode, &name); + kfree(name.name); iput(dir); if (ret) goto out; goto again; } - kfree(name); - ref_ptr += namelen; + kfree(name.name); + ref_ptr += name.len; if (key->type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else @@ -1381,8 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - char *name = NULL; - int namelen; + struct qstr name; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1426,7 +1417,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); /* * parent object can change from one array @@ -1439,15 +1430,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); } if (ret) goto out; ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen); + btrfs_ino(BTRFS_I(inode)), ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1461,7 +1450,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = __add_inode_ref(trans, root, path, log, BTRFS_I(dir), BTRFS_I(inode), inode_objectid, parent_objectid, - ref_index, name, namelen); + ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1470,7 +1459,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - name, namelen, 0, ref_index); + &name, 0, ref_index); if (ret) goto out; @@ -1480,9 +1469,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* Else, ret == 1, we already have a perfect match, we're done. 
*/ - ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; - kfree(name); - name = NULL; + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; + kfree(name.name); + name.name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; @@ -1506,7 +1495,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); - kfree(name); + kfree(name.name); iput(dir); iput(inode); return ret; @@ -1778,7 +1767,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - char *name, int name_len, + const struct qstr *name, struct btrfs_key *location) { struct inode *inode; @@ -1796,7 +1785,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 1, index); + 1, index); /* FIXME, put inode into FIXUP list */ @@ -1856,8 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - char *name; - int name_len; + struct qstr name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -1875,17 +1863,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!dir) return -EIO; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } log_type = btrfs_dir_type(eb, di); - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -1895,7 +1877,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = 0; dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); + &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); goto out; @@ -1912,7 +1894,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, key->objectid, key->offset, - name, name_len, 1); + &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); goto out; @@ -1940,7 +1922,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); + ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1953,8 +1935,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, key->objectid, name, - name_len); + ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1965,7 +1946,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } btrfs_release_path(path); ret = insert_one_name(trans, root, key->objectid, key->offset, - name, name_len, &log_key); + &name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; if (!ret) @@ 
-1975,10 +1956,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } - kfree(name); + kfree(name.name); iput(dir); if (!ret && name_added) ret = 1; @@ -2144,8 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - int name_len; - char *name; + struct qstr name; struct inode *inode = NULL; struct btrfs_key location; @@ -2160,22 +2140,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, eb = path->nodes[0]; slot = path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } - - read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); if (log) { struct btrfs_dir_item *log_di; log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, - dir_key->offset, - name, name_len, 0); + dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); goto out; @@ -2201,7 +2175,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, inc_nlink(inode); ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - name, name_len); + &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2210,7 +2184,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_release_path(log_path); - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -3449,7 +3423,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const char *name, int name_len, + const struct qstr *name, u64 index) { struct btrfs_dir_item *di; @@ -3459,7 +3433,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * for dir item keys. 
*/ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); + index, name, -1); if (IS_ERR(di)) return PTR_ERR(di); else if (!di) @@ -3496,7 +3470,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3523,7 +3497,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), - name, name_len, index); + name, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); @@ -3535,7 +3509,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -3556,7 +3530,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) @@ -5270,6 +5244,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; + struct qstr name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -5303,8 +5278,11 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } read_extent_buffer(eb, name, name_ptr, this_name_len); + + name_str.name = name; + name_str.len = this_name_len; di = btrfs_lookup_dir_item(NULL, inode->root, search_path, - parent, name, this_name_len, 0); + parent, &name_str, 0); if (di && !IS_ERR(di)) { struct btrfs_key di_key; @@ -7505,8 +7483,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - old_dentry->d_name.name, - old_dentry->d_name.len, old_dir_index); + &old_dentry->d_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f5770829d075..8c9a387151b0 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -87,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); -- cgit From ab3c5c18e8fa3f8ea116016095d25adab466cd39 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:26 -0400 Subject: btrfs: setup qstr from dentrys using fscrypt helper Most places where we get a struct qstr, we are doing so from a dentry. With fscrypt, the dentry's name may be encrypted on-disk, so fscrypt provides a helper to convert a dentry name to the appropriate disk name if necessary. 
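In code form, the pattern this patch applies looks roughly like the sketch below (the wrapper and its locals are illustrative only, not part of the patch; it assumes <linux/fscrypt.h> and <linux/dcache.h>, and the third argument to fscrypt_setup_filename() is 1 on lookup/deletion paths and 0 otherwise):

	/*
	 * Minimal sketch of the conversion pattern; example_setup_name()
	 * and its locals are hypothetical names for illustration.
	 */
	static int example_setup_name(struct inode *dir, struct dentry *dentry)
	{
		struct fscrypt_name fname;
		struct qstr name;
		int ret;

		/* May allocate; returns 0 on success or a negative errno. */
		ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
		if (ret)
			return ret;

		/* View the (possibly encrypted) on-disk name as a qstr. */
		name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name);

		/* ... hand &name to the btrfs helpers that take a qstr ... */

		fscrypt_free_filename(&fname);
		return 0;
	}
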
Convert each of the dentry name accesses to use fscrypt_setup_filename(), then convert the resulting fscrypt_name back to an unencrypted qstr. This does not work for nokey names, but the specific locations that could spawn nokey names are noted. At present, since there are no encrypted directories, nothing goes down the filename encryption paths. Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 + fs/btrfs/inode.c | 192 +++++++++++++++++++++++++++++++++++++------------ fs/btrfs/transaction.c | 40 +++++++---- fs/btrfs/tree-log.c | 11 ++- 4 files changed, 189 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9e8c9f9bc4fb..b2016c14488b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" @@ -1674,6 +1675,8 @@ struct btrfs_new_inode_args { */ struct posix_acl *default_acl; struct posix_acl *acl; + struct fscrypt_name fname; + struct qstr name; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3162d8f0aaf8..fa4b101bc1eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4429,28 +4429,41 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; + struct fscrypt_name fname; + struct qstr name; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fscrypt_free; + } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &dentry->d_name); + &name); if (ret) - goto out; + goto end_trans; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) - goto out; + goto end_trans; } -out: +end_trans: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); +fscrypt_free: + fscrypt_free_filename(&fname); return ret; } @@ -4463,11 +4476,19 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const struct qstr *name = &dentry->d_name; + struct qstr name; u64 index; int ret; u64 objectid; u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = inode->root->root_key.objectid; @@ -4475,14 +4496,17 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, objectid = inode->location.objectid; } else { WARN_ON(1); + fscrypt_free_filename(&fname); return -EINVAL; } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -4508,7 +4532,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, name); + di = btrfs_search_dir_index_item(root, path, dir_ino, &name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4525,7 +4549,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name); + &index, &name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4538,7 +4562,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name->len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name.len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4547,6 +4571,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return ret; } @@ -4810,6 +4835,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; + struct fscrypt_name fname; + struct qstr name; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4822,9 +4849,18 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(dir, dentry); } + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (err) + return err; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ + trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_notrans; + } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, dir, dentry); @@ -4839,7 +4875,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &dentry->d_name); + &name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -4858,7 +4894,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); +out_notrans: btrfs_btree_balance_dirty(fs_info); + fscrypt_free_filename(&fname); return err; } @@ -5531,18 +5569,27 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const struct qstr *name = &dentry->d_name; + struct qstr name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; int ret = 0; + struct fscrypt_name fname; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + goto out; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, 0); + &name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -5554,12 +5601,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name->name, btrfs_ino(BTRFS_I(dir)), + __func__, name.name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) *type = btrfs_dir_type(path->nodes[0], di); out: + fscrypt_free_filename(&fname); btrfs_free_path(path); return ret; } @@ -5582,6 +5630,14 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; int err = 0; + struct fscrypt_name fname; + struct qstr name; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (ret) + return ret; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); path = btrfs_alloc_path(); if (!path) { @@ -5604,12 +5660,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + btrfs_root_ref_name_len(leaf, ref) != name.len) goto out; - ret = memcmp_extent_buffer(leaf, dentry->d_name.name, - (unsigned long)(ref + 1), - dentry->d_name.len); + ret = memcmp_extent_buffer(leaf, name.name, (unsigned long)(ref + 1), + name.len); if (ret) goto out; @@ -5628,6 +5683,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, err = 0; out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return err; } @@ -6236,9 +6292,19 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, struct inode *inode = args->inode; int ret; + if (!args->orphan) { + ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, + &args->fname); + if (ret) + return ret; + args->name = (struct qstr)FSTR_TO_QSTR(&args->fname.disk_name); + } + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); - if (ret) + if (ret) { + fscrypt_free_filename(&args->fname); return ret; + } /* 1 to add inode item */ *trans_num_items = 1; @@ -6278,6 +6344,7 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) { posix_acl_release(args->acl); posix_acl_release(args->default_acl); + fscrypt_free_filename(&args->fname); } /* @@ -6703,6 +6770,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct fscrypt_name fname; + struct qstr name; u64 index; int err; int drop_inode = 0; @@ -6714,6 +6783,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto fail; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6740,7 +6815,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &dentry->d_name, 1, index); + &name, 1, index); if (err) { drop_inode = 1; @@ -6764,6 +6839,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } fail: + fscrypt_free_filename(&fname); if (trans) 
btrfs_end_transaction(trans); if (drop_inode) { @@ -9003,6 +9079,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + struct fscrypt_name old_fname, new_fname; + struct qstr old_name, new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9014,6 +9092,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); + new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); + /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9081,8 +9172,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9095,8 +9185,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, &old_dentry->d_name, - new_ino, + ret = btrfs_insert_inode_ref(trans, root, &old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9131,8 +9220,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - &old_dentry->d_name, - &old_rename_ctx); + &old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9147,8 +9235,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - &new_dentry->d_name, - &new_rename_ctx); + &new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9158,14 +9245,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_dentry->d_name, 0, old_idx); + &new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - &old_dentry->d_name, 0, new_idx); + &old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9208,6 +9295,8 @@ out_notrans: old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + fscrypt_free_filename(&new_fname); + fscrypt_free_filename(&old_fname); return ret; } @@ -9247,6 +9336,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + struct fscrypt_name old_fname, new_fname; + struct qstr old_name, new_name; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9263,21 +9354,32 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, 
&old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); + new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - &new_dentry->d_name); + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { - return ret; + goto out_fscrypt_names; } } else { /* maybe -EOVERFLOW */ - return ret; + goto out_fscrypt_names; } } ret = 0; @@ -9360,8 +9462,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) goto out_fail; @@ -9385,7 +9486,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), - &old_dentry->d_name, &rename_ctx); + &old_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9404,7 +9505,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - &new_dentry->d_name); + &new_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9416,7 +9517,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_dentry->d_name, 0, index); + &new_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9451,6 +9552,9 @@ out_notrans: out_whiteout_inode: if (flags & RENAME_WHITEOUT) iput(whiteout_args.inode); +out_fscrypt_names: + fscrypt_free_filename(&old_fname); + fscrypt_free_filename(&new_fname); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7db9612f4d0e..3751b155e361 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -1611,10 +1612,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode; + struct inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec64 cur_time; @@ -1623,6 +1623,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; + unsigned int nofs_flags; + struct fscrypt_name fname; + struct qstr name; ASSERT(pending->path); path = pending->path; @@ -1630,9 +1633,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; + /* + * We're inside a transaction and must make sure that any potential + * allocations with GFP_KERNEL in fscrypt won't recurse back to + * filesystem. 
+ */ + nofs_flags = memalloc_nofs_save(); + pending->error = fscrypt_setup_filename(parent_inode, + &pending->dentry->d_name, 0, + &fname); + memalloc_nofs_restore(nofs_flags); + if (pending->error) + goto free_pending; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) - goto no_free_objectid; + goto free_fname; /* * Make qgroup to skip current new snapshot's qgroupid, as it is @@ -1661,8 +1678,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - dentry = pending->dentry; - parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) @@ -1678,7 +1693,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - &dentry->d_name, 0); + &name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1773,7 +1788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - &dentry->d_name); + &name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1805,9 +1820,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, &dentry->d_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, - index); + ret = btrfs_insert_dir_item(trans, &name, BTRFS_I(parent_inode), &key, + BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. 
*/ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1816,7 +1830,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - dentry->d_name.len * 2); + name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); @@ -1848,7 +1862,9 @@ dir_item_existed: trans->bytes_reserved = 0; clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); -no_free_objectid: +free_fname: + fscrypt_free_filename(&fname); +free_pending: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b14ac7017f41..3f3d10a2dd0e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7446,9 +7446,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, if (old_dir && old_dir->logged_trans == trans->transid) { struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; + struct fscrypt_name fname; + struct qstr name; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ret = fscrypt_setup_filename(&old_dir->vfs_inode, + &old_dentry->d_name, 0, &fname); + if (ret) + goto out; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7468,6 +7475,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + fscrypt_free_filename(&fname); goto out; } @@ -7483,7 +7491,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - &old_dentry->d_name, old_dir_index); + &name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its @@ -7497,6 +7505,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, mutex_unlock(&old_dir->log_mutex); btrfs_free_path(path); + fscrypt_free_filename(&fname); if (ret < 0) goto out; } -- cgit From 6db75318823a169e836a478ca57d6a7c0a156b77 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:27 -0400 Subject: btrfs: use struct fscrypt_str instead of struct qstr While struct qstr is more natural without fscrypt, since it's provided by dentries, struct fscrypt_str is provided by the fscrypt handlers processing dentries, and is thus more natural in the fscrypt world. Replace all of the struct qstr uses with struct fscrypt_str. 
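For context, the two structures differ only slightly; the abridged layouts below (simplified from include/linux/dcache.h and include/linux/fscrypt.h, with several fscrypt_name fields omitted) show why passing the disk name through directly is natural:

	struct qstr {				/* from the dcache */
		u64 hash_len;			/* dentry hash and length, packed */
		const unsigned char *name;
	};

	struct fscrypt_str {			/* what the fscrypt handlers produce */
		unsigned char *name;
		u32 len;
	};

	struct fscrypt_name {
		const struct qstr *usr_fname;	/* name as given by the user */
		struct fscrypt_str disk_name;	/* name as stored on disk */
		/* ... hash and crypto buffer fields omitted ... */
	};

With the btrfs helpers taking struct fscrypt_str, callers can pass &fname.disk_name straight through and drop the intermediate FSTR_TO_QSTR conversions, which accounts for the bulk of the diff below.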
Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 ++++++----- fs/btrfs/dir-item.c | 10 +++--- fs/btrfs/inode-item.c | 14 ++++---- fs/btrfs/inode-item.h | 10 +++--- fs/btrfs/inode.c | 87 ++++++++++++++++++++------------------------------ fs/btrfs/ioctl.c | 4 +-- fs/btrfs/root-tree.c | 4 +-- fs/btrfs/send.c | 4 +-- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 13 ++++---- fs/btrfs/tree-log.c | 42 ++++++++++++------------ fs/btrfs/tree-log.h | 4 +-- 12 files changed, 95 insertions(+), 118 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2016c14488b..7c710893f4c6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1518,10 +1518,10 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1550,23 +1550,23 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct qstr *name, struct btrfs_inode *dir, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const struct qstr *name, int mod); + const struct fscrypt_str *name, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const struct qstr *name, int mod); + u64 index, const struct fscrypt_str *name, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1647,10 +1647,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct qstr *name, int add_backref, u64 index); + const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); @@ -1676,7 +1676,6 @@ struct btrfs_new_inode_args { struct posix_acl *default_acl; struct posix_acl *acl; struct fscrypt_name fname; - struct qstr name; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); diff --git a/fs/btrfs/dir-item.c 
b/fs/btrfs/dir-item.c index 48a15af4ec57..23777021b331 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -106,7 +106,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct qstr *name, struct btrfs_inode *dir, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -208,7 +208,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const struct qstr *name, + const struct fscrypt_str *name, int mod) { struct btrfs_key key; @@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; struct btrfs_key key; @@ -304,7 +304,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const struct qstr *name, int mod) + u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -323,7 +323,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, - u64 dirid, const struct qstr *name) + u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 577e39cfc411..d66bdbae5585 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -15,7 +15,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -81,7 +81,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -104,7 +104,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -174,7 +174,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -251,7 +251,7 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { @@ -306,7 +306,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int 
btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 3c657c670cfd..b80aeb715701 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,10 +64,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -80,15 +80,15 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const struct qstr *name); + const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const struct qstr *name); + const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa4b101bc1eb..7e76d5e91786 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4286,7 +4286,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4389,7 +4389,7 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; @@ -4430,12 +4430,10 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); int ret; struct fscrypt_name fname; - struct qstr name; ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4449,7 +4447,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 0); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &name); + &fname.disk_name); if (ret) goto end_trans; @@ -4476,7 +4474,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - struct qstr name; u64 index; int ret; u64 objectid; @@ -4486,7 +4483,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4506,7 +4502,8 @@ static int 
btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &name, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + &fname.disk_name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4532,7 +4529,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, &name); + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4549,7 +4546,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, &name); + &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4562,7 +4559,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name.len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - fname.disk_name.len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4585,7 +4582,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4836,7 +4833,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; - struct qstr name; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4852,7 +4848,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (err) return err; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4875,7 +4870,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &name); + &fname.disk_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -5569,7 +5564,6 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - struct qstr name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -5584,12 +5578,10 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, if (ret) goto out; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - /* This needs to handle no-key deletions later on */ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - &name, 0); + &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -5601,7 +5593,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name.name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) @@ -5631,14 +5623,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, int ret; int err = 0; struct fscrypt_name fname; - struct qstr name; ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -5660,11 +5649,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != name.len) + btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; - ret = memcmp_extent_buffer(leaf, name.name, (unsigned long)(ref + 1), - name.len); + ret = memcmp_extent_buffer(leaf, fname.disk_name.name, + (unsigned long)(ref + 1), fname.disk_name.len); if (ret) goto out; @@ -6297,7 +6286,6 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, &args->fname); if (ret) return ret; - args->name = (struct qstr)FSTR_TO_QSTR(&args->fname.disk_name); } ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); @@ -6380,7 +6368,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const struct qstr *name = args->orphan ? NULL : &args->dentry->d_name; + const struct fscrypt_str *name = args->orphan ? 
NULL : &args->fname.disk_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6615,7 +6603,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct qstr *name, int add_backref, u64 index) + const struct fscrypt_str *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6771,7 +6759,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct fscrypt_name fname; - struct qstr name; u64 index; int err; int drop_inode = 0; @@ -6787,8 +6774,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6815,7 +6800,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 1, index); + &fname.disk_name, 1, index); if (err) { drop_inode = 1; @@ -9080,7 +9065,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret2; bool need_abort = false; struct fscrypt_name old_fname, new_fname; - struct qstr old_name, new_name; + struct fscrypt_str *old_name, *new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9102,8 +9087,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, return ret; } - old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); - new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); + old_name = &old_fname.disk_name; + new_name = &new_fname.disk_name; /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || @@ -9172,7 +9157,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, + ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9185,7 +9170,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, &old_name, new_ino, + ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9220,7 +9205,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - &old_name, &old_rename_ctx); + old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9235,7 +9220,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - &new_name, &new_rename_ctx); + new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9245,14 +9230,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_name, 0, old_idx); + new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - &old_name, 0, new_idx); + old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9337,7 +9322,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; - struct qstr old_name, new_name; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9364,12 +9348,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, return ret; } - old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); - new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); - /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_name); - + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get @@ -9462,8 +9442,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, - btrfs_ino(BTRFS_I(new_dir)), index); + ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, + old_ino, btrfs_ino(BTRFS_I(new_dir)), + index); if (ret) goto out_fail; } @@ -9486,7 +9467,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), - &old_name, &rename_ctx); + &old_fname.disk_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9505,7 +9486,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - &new_name); + &new_fname.disk_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9517,7 +9498,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_name, 0, index); + &new_fname.disk_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fde55d0f61b4..2132e3e15986 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -951,7 +951,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; - struct qstr name_str = QSTR_INIT(name, namelen); + struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -3779,7 +3779,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 848a720747af..6aab98114253 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -331,7 +331,7 @@ out: int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -403,7 +403,7 @@ out: */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d0feeea2ba1c..0ebca9dba5ab 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1597,7 +1597,7 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { - struct qstr tmp_name; + struct fscrypt_str tmp_name; len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); @@ -1757,7 +1757,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; - struct qstr name_str = QSTR_INIT(name, name_len); + struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); if (!path) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 13d33e5707de..33bf211699f8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1423,7 +1423,7 @@ 
static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3751b155e361..0d44d50dc3be 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1625,7 +1625,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; unsigned int nofs_flags; struct fscrypt_name fname; - struct qstr name; ASSERT(pending->path); path = pending->path; @@ -1645,7 +1644,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memalloc_nofs_restore(nofs_flags); if (pending->error) goto free_pending; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) @@ -1693,7 +1691,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - &name, 0); + &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1788,7 +1786,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - &name); + &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1820,8 +1818,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, &name, BTRFS_I(parent_inode), &key, - BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &fname.disk_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have check then name at the beginning, so it is impossible. 
*/ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1830,7 +1829,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - name.len * 2); + fname.disk_name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3f3d10a2dd0e..7002cc3315da 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -598,7 +598,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, } static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, - struct qstr *name) + struct fscrypt_str *name) { char *buf; @@ -917,7 +917,7 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; @@ -948,7 +948,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - struct qstr name; + struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -989,7 +989,7 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - struct qstr *name) + struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key location; @@ -1036,7 +1036,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_path *path; int ret; @@ -1072,7 +1072,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, struct qstr *name) + u64 ref_index, struct fscrypt_str *name) { int ret; struct extent_buffer *leaf; @@ -1106,7 +1106,7 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - struct qstr victim_name; + struct fscrypt_str victim_name; victim_ref = (struct btrfs_inode_ref *)ptr; ret = read_alloc_one_name(leaf, (victim_ref + 1), @@ -1156,7 +1156,7 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - struct qstr victim_name; + struct fscrypt_str victim_name; extref = (struct btrfs_inode_extref *)(base + cur_offset); @@ -1231,7 +1231,7 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - struct qstr *name, u64 *index, + struct fscrypt_str *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; @@ -1253,7 +1253,7 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - struct qstr *name, u64 *index) + struct fscrypt_str *name, u64 *index) { struct btrfs_inode_ref *ref; int ret; @@ -1305,7 +1305,7 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - struct qstr name; + struct fscrypt_str name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { @@ -1373,7 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned 
long ref_ptr; unsigned long ref_end; - struct qstr name; + struct fscrypt_str name; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1767,7 +1767,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_key *location) { struct inode *inode; @@ -1845,7 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct qstr name; + struct fscrypt_str name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2125,7 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct qstr name; + struct fscrypt_str name; struct inode *inode = NULL; struct btrfs_key location; @@ -3423,7 +3423,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const struct qstr *name, + const struct fscrypt_str *name, u64 index) { struct btrfs_dir_item *di; @@ -3470,7 +3470,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3509,7 +3509,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -5244,7 +5244,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; - struct qstr name_str; + struct fscrypt_str name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -7447,7 +7447,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; struct fscrypt_name fname; - struct qstr name; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); @@ -7455,7 +7454,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, &old_dentry->d_name, 0, &fname); if (ret) goto out; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7491,7 +7489,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - &name, old_dir_index); + &fname.disk_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 8c9a387151b0..85b43075ac58 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -87,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root 
*root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); -- cgit From 94a48aef49f235cc1efc74dc18e7708ca3b8d698 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 20 Oct 2022 12:58:28 -0400 Subject: btrfs: extend btrfs_dir_item type to store encryption status For directories with encrypted files/filenames, we need to store a flag indicating this fact. There's no room in other fields, so we'll need to borrow a bit from dir_type. Since it's now a combination of type and flags, we rename it to dir_flags to reflect its new usage. The new flag, FT_ENCRYPTED, indicates a directory containing encrypted data, which is orthogonal to file type; therefore, add the new flag, and make conversion from directory type to file type strip the flag. As the file types almost never change we can afford to use the bits. Actual usage will be guarded behind an incompat bit, this patch only adds the support for later use by fscrypt. Signed-off-by: Omar Sandoval Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 15 +++++++++++++-- fs/btrfs/delayed-inode.c | 6 +++--- fs/btrfs/delayed-inode.h | 2 +- fs/btrfs/dir-item.c | 7 +++++-- fs/btrfs/inode.c | 13 +++++++------ fs/btrfs/print-tree.c | 4 ++-- fs/btrfs/send.c | 2 +- fs/btrfs/tree-checker.c | 2 +- fs/btrfs/tree-log.c | 20 ++++++++++---------- 9 files changed, 43 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index e8bbd4d80161..cb59b69d2af1 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -484,14 +484,25 @@ BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); +static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, + const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); +} + +static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); +} + static inline void btrfs_dir_item_key(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_disk_key *key) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 012c96de4701..f93d2695e423 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1416,7 +1416,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index) { struct btrfs_fs_info *fs_info = 
trans->fs_info; @@ -1447,7 +1447,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_transid(dir_item, trans->transid); btrfs_set_stack_dir_data_len(dir_item, 0); btrfs_set_stack_dir_name_len(dir_item, name_len); - btrfs_set_stack_dir_type(dir_item, type); + btrfs_set_stack_dir_flags(dir_item, flags); memcpy((char *)(dir_item + 1), name, name_len); data_len = delayed_item->data_len + sizeof(struct btrfs_item); @@ -1757,7 +1757,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = fs_ftype_to_dtype(di->type); + d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); over = !dir_emit(ctx, name, name_len, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0163ca637a96..4f21daa3dbc7 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -113,7 +113,7 @@ static inline void btrfs_init_delayed_root( int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 23777021b331..ca69fb35a2cc 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -83,7 +83,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); @@ -140,9 +140,12 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, goto out_free; } + if (IS_ENCRYPTED(&dir->vfs_inode)) + type |= BTRFS_FT_ENCRYPTED; + leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7e76d5e91786..78867a084428 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5597,7 +5597,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, location->objectid, location->type, location->offset); } if (!ret) - *type = btrfs_dir_type(path->nodes[0], di); + *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); btrfs_free_path(path); @@ -6046,6 +6046,7 @@ again: btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; struct extent_buffer *leaf = path->nodes[0]; + u8 ftype; if (found_key.objectid != key.objectid) break; @@ -6069,13 +6070,13 @@ again: goto again; } + ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); entry = addr; - put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); - read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), - name_len); - put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), - &entry->type); + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + put_unaligned(name_len, &entry->name_len); + 
put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aab7d30eed55..1a2350fd68be 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -242,9 +242,9 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu type %u\n", + pr_info("\t\tdir oid %llu flags %u\n", found_key.objectid, - btrfs_dir_type(l, di)); + btrfs_dir_flags(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0ebca9dba5ab..2ece3a030a66 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1094,7 +1094,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, data_len = btrfs_dir_data_len(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { + if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 11cafc520b47..1c2d418dda6a 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -531,7 +531,7 @@ static int check_dir_item(struct extent_buffer *leaf, } /* dir type check */ - dir_type = btrfs_dir_type(leaf, di); + dir_type = btrfs_dir_ftype(leaf, di); if (unlikely(dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7002cc3315da..a5e56a678af2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1799,7 +1799,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, - u8 log_type, + u8 log_flags, bool exists) { struct btrfs_key found_key; @@ -1809,7 +1809,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) + btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1853,7 +1853,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct btrfs_key search_key; struct inode *dir; - u8 log_type; + u8 log_flags; bool exists; int ret; bool update_size = true; @@ -1867,7 +1867,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (ret) goto out; - log_type = btrfs_dir_type(eb, di); + log_flags = btrfs_dir_flags(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -1883,8 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto out; } else if (dir_dst_di) { ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, log_type, - exists); + dir_dst_di, &log_key, + log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1901,7 +1901,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } else if (index_dst_di) { ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, index_dst_di, 
&log_key, - log_type, exists); + log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -2010,7 +2010,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, * to ever delete the parent directory has it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { struct btrfs_path *fixup_path; struct btrfs_key di_key; @@ -5452,7 +5452,7 @@ again: } di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); - type = btrfs_dir_type(leaf, di); + type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) continue; btrfs_dir_item_key_to_cpu(leaf, di, &di_key); @@ -6292,7 +6292,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, continue; } - if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR) + if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; -- cgit From a56159d4080b793ee8dc3d3d315579801ad9096c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:52 -0400 Subject: btrfs: move btrfs_fs_info declarations into fs.h Now that we have a lot of the fs_info related helpers and stuff isolated, copy these over to fs.h out of ctree.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 659 +----------------------------------------------------- fs/btrfs/fs.h | 660 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 661 insertions(+), 658 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7c710893f4c6..ace6f41612e5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -36,6 +36,7 @@ #include "block-rsv.h" #include "locking.h" #include "misc.h" +#include "fs.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -53,14 +54,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -#define BTRFS_OLDEST_GENERATION 0ULL - -#define BTRFS_EMPTY_DIR_SIZE 0 - -#define BTRFS_DIRTY_METADATA_THRESH SZ_32M - -#define BTRFS_MAX_EXTENT_SIZE SZ_128M - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -68,17 +61,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - -/* - * The reserved space at the beginning of each device. - * It covers the primary super block and leaves space for potential use by other - * tools like bootloaders or to lower potential damage of accidental overwrite. 
- */ -#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, @@ -137,645 +119,6 @@ struct btrfs_path { unsigned int nowait:1; }; -struct btrfs_dev_replace { - u64 replace_state; /* see #define above */ - time64_t time_started; /* seconds since 1-Jan-1970 */ - time64_t time_stopped; /* seconds since 1-Jan-1970 */ - atomic64_t num_write_errors; - atomic64_t num_uncorrectable_read_errors; - - u64 cursor_left; - u64 committed_cursor_left; - u64 cursor_left_last_write_of_item; - u64 cursor_right; - - u64 cont_reading_from_srcdev_mode; /* see #define above */ - - int is_valid; - int item_needs_writeback; - struct btrfs_device *srcdev; - struct btrfs_device *tgtdev; - - struct mutex lock_finishing_cancel_unmount; - struct rw_semaphore rwsem; - - struct btrfs_scrub_progress scrub_progress; - - struct percpu_counter bio_counter; - wait_queue_head_t replace_wait; -}; - -/* - * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations. In ssd_spread mode they are also used for data allocations. - */ -struct btrfs_free_cluster { - spinlock_t lock; - spinlock_t refill_lock; - struct rb_root root; - - /* largest extent in this cluster */ - u64 max_size; - - /* first extent starting offset */ - u64 window_start; - - /* We did a full search and couldn't create a cluster */ - bool fragmented; - - struct btrfs_block_group *block_group; - /* - * when a cluster is allocated from a block group, we put the - * cluster onto a list in the block group so that it can - * be freed before the block group is freed. - */ - struct list_head block_group_list; -}; - -/* Discard control. */ -/* - * Async discard uses multiple lists to differentiate the discard filter - * parameters. Index 0 is for completely free block groups where we need to - * ensure the entire block group is trimmed without being lossy. Indices - * afterwards represent monotonically decreasing discard filter sizes to - * prioritize what should be discarded next. - */ -#define BTRFS_NR_DISCARD_LISTS 3 -#define BTRFS_DISCARD_INDEX_UNUSED 0 -#define BTRFS_DISCARD_INDEX_START 1 - -struct btrfs_discard_ctl { - struct workqueue_struct *discard_workers; - struct delayed_work work; - spinlock_t lock; - struct btrfs_block_group *block_group; - struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; - u64 prev_discard; - u64 prev_discard_time; - atomic_t discardable_extents; - atomic64_t discardable_bytes; - u64 max_discard_size; - u64 delay_ms; - u32 iops_limit; - u32 kbps_limit; - u64 discard_extent_bytes; - u64 discard_bitmap_bytes; - atomic64_t discard_bytes_saved; -}; - -/* - * Exclusive operations (device replace, resize, device add/remove, balance) - */ -enum btrfs_exclusive_operation { - BTRFS_EXCLOP_NONE, - BTRFS_EXCLOP_BALANCE_PAUSED, - BTRFS_EXCLOP_BALANCE, - BTRFS_EXCLOP_DEV_ADD, - BTRFS_EXCLOP_DEV_REMOVE, - BTRFS_EXCLOP_DEV_REPLACE, - BTRFS_EXCLOP_RESIZE, - BTRFS_EXCLOP_SWAP_ACTIVATE, -}; - -/* Store data about transaction commits, exported via sysfs. 
*/ -struct btrfs_commit_stats { - /* Total number of commits */ - u64 commit_count; - /* The maximum commit duration so far in ns */ - u64 max_commit_dur; - /* The last commit duration in ns */ - u64 last_commit_dur; - /* The total commit duration in ns */ - u64 total_commit_dur; -}; - -struct btrfs_fs_info { - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - unsigned long flags; - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *fs_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *data_reloc_root; - struct btrfs_root *block_group_root; - - /* the log root tree is a directory of all the other log roots */ - struct btrfs_root *log_root_tree; - - /* The tree that holds the global roots (csum, extent, etc) */ - rwlock_t global_root_lock; - struct rb_root global_root_tree; - - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; - - /* block group cache stuff */ - rwlock_t block_group_cache_lock; - struct rb_root_cached block_group_cache_tree; - - /* keep track of unallocated space */ - atomic64_t free_chunk_space; - - /* Track ranges which are used by log trees blocks/logged data extents */ - struct extent_io_tree excluded_extents; - - /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; - - /* - * block reservation for extent, checksum, root tree and - * delayed dir index item - */ - struct btrfs_block_rsv global_block_rsv; - /* block reservation for metadata operations */ - struct btrfs_block_rsv trans_block_rsv; - /* block reservation for chunk tree */ - struct btrfs_block_rsv chunk_block_rsv; - /* block reservation for delayed operations */ - struct btrfs_block_rsv delayed_block_rsv; - /* block reservation for delayed refs */ - struct btrfs_block_rsv delayed_refs_rsv; - - struct btrfs_block_rsv empty_block_rsv; - - u64 generation; - u64 last_trans_committed; - /* - * Generation of the last transaction used for block group relocation - * since the filesystem was last mounted (or 0 if none happened yet). - * Must be written and read while holding btrfs_fs_info::commit_root_sem. - */ - u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; - unsigned long mount_opt; - - unsigned long compress_type:4; - unsigned int compress_level; - u32 commit_interval; - /* - * It is a suggestive number, the read side is safe even it gets a - * wrong number because we will write out the data into a regular - * extent. The write side(mount/remount) is under ->s_umount lock, - * so it is also safe. - */ - u64 max_inline; - - struct btrfs_transaction *running_transaction; - wait_queue_head_t transaction_throttle; - wait_queue_head_t transaction_wait; - wait_queue_head_t transaction_blocked_wait; - wait_queue_head_t async_submit_wait; - - /* - * Used to protect the incompat_flags, compat_flags, compat_ro_flags - * when they are updated. - * - * Because we do not clear the flags for ever, so we needn't use - * the lock on the read side. - * - * We also needn't use the lock when we mount the fs, because - * there is no other task which will update the flag. 
- */ - spinlock_t super_lock; - struct btrfs_super_block *super_copy; - struct btrfs_super_block *super_for_commit; - struct super_block *sb; - struct inode *btree_inode; - struct mutex tree_log_mutex; - struct mutex transaction_kthread_mutex; - struct mutex cleaner_mutex; - struct mutex chunk_mutex; - - /* - * this is taken to make sure we don't set block groups ro after - * the free space cache has been allocated on them - */ - struct mutex ro_block_group_mutex; - - /* this is used during read/modify/write to make sure - * no two ios are trying to mod the same stripe at the same - * time - */ - struct btrfs_stripe_hash_table *stripe_hash_table; - - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - - struct rw_semaphore commit_root_sem; - - struct rw_semaphore cleanup_work_sem; - - struct rw_semaphore subvol_sem; - - spinlock_t trans_lock; - /* - * the reloc mutex goes with the trans lock, it is taken - * during commit to protect us from the relocation code - */ - struct mutex reloc_mutex; - - struct list_head trans_list; - struct list_head dead_roots; - struct list_head caching_block_groups; - - spinlock_t delayed_iput_lock; - struct list_head delayed_iputs; - atomic_t nr_delayed_iputs; - wait_queue_head_t delayed_iputs_wait; - - atomic64_t tree_mod_seq; - - /* this protects tree_mod_log and tree_mod_seq_list */ - rwlock_t tree_mod_log_lock; - struct rb_root tree_mod_log; - struct list_head tree_mod_seq_list; - - atomic_t async_delalloc_pages; - - /* - * this is used to protect the following list -- ordered_roots. - */ - spinlock_t ordered_root_lock; - - /* - * all fs/file tree roots in which there are data=ordered extents - * pending writeback are added into this list. - * - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ - struct list_head ordered_roots; - - struct mutex delalloc_root_mutex; - spinlock_t delalloc_root_lock; - /* all fs/file tree roots that have delalloc inodes. */ - struct list_head delalloc_roots; - - /* - * there is a pool of worker threads for checksumming during writes - * and a pool for checksumming after reads. This is because readers - * can run with FS locks held, and the writers may be waiting for - * those locks. We don't want ordering in the pending list to cause - * deadlocks, and so the two are serviced separately. - * - * A third pool does submit_bio to avoid deadlocking with the other - * two - */ - struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; - struct btrfs_workqueue *delalloc_workers; - struct btrfs_workqueue *flush_workers; - struct workqueue_struct *endio_workers; - struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; - struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; - struct btrfs_workqueue *endio_write_workers; - struct btrfs_workqueue *endio_freespace_worker; - struct btrfs_workqueue *caching_workers; - - /* - * fixup workers take dirty pages that didn't properly go through - * the cow mechanism and make them safe to write. 
It happens - * for the sys_munmap function call path - */ - struct btrfs_workqueue *fixup_workers; - struct btrfs_workqueue *delayed_workers; - - struct task_struct *transaction_kthread; - struct task_struct *cleaner_kthread; - u32 thread_pool_size; - - struct kobject *space_info_kobj; - struct kobject *qgroups_kobj; - struct kobject *discard_kobj; - - /* used to keep from writing metadata until there is a nice batch */ - struct percpu_counter dirty_metadata_bytes; - struct percpu_counter delalloc_bytes; - struct percpu_counter ordered_bytes; - s32 dirty_metadata_batch; - s32 delalloc_batch; - - struct list_head dirty_cowonly_roots; - - struct btrfs_fs_devices *fs_devices; - - /* - * The space_info list is effectively read only after initial - * setup. It is populated at mount time and cleaned up after - * all block groups are removed. RCU is used to protect it. - */ - struct list_head space_info; - - struct btrfs_space_info *data_sinfo; - - struct reloc_control *reloc_ctl; - - /* data_alloc_cluster is only used in ssd_spread mode */ - struct btrfs_free_cluster data_alloc_cluster; - - /* all metadata allocations go through this cluster */ - struct btrfs_free_cluster meta_alloc_cluster; - - /* auto defrag inodes go here */ - spinlock_t defrag_inodes_lock; - struct rb_root defrag_inodes; - atomic_t defrag_running; - - /* Used to protect avail_{data, metadata, system}_alloc_bits */ - seqlock_t profiles_lock; - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ - u64 avail_data_alloc_bits; - u64 avail_metadata_alloc_bits; - u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; - - /* Cancellation requests for chunk relocation */ - atomic_t reloc_cancel_req; - - u32 data_chunk_allocations; - u32 metadata_ratio; - - void *bdev_holder; - - /* private scrub information */ - struct mutex scrub_lock; - atomic_t scrubs_running; - atomic_t scrub_pause_req; - atomic_t scrubs_paused; - atomic_t scrub_cancel_req; - wait_queue_head_t scrub_pause_wait; - /* - * The worker pointers are NULL iff the refcount is 0, ie. scrub is not - * running. - */ - refcount_t scrub_workers_refcnt; - struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity_workers; - struct btrfs_subpage_info *subpage_info; - - struct btrfs_discard_ctl discard_ctl; - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - /* is qgroup tracking in a consistent state? */ - u64 qgroup_flags; - - /* holds configuration and tracking. Protected by qgroup_lock */ - struct rb_root qgroup_tree; - spinlock_t qgroup_lock; - - /* - * used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* - * Protect user change for quota operations. If a transaction is needed, - * it must be started before locking this lock. 
- */ - struct mutex qgroup_ioctl_lock; - - /* list of dirty qgroups to be written at next commit */ - struct list_head dirty_qgroups; - - /* used by qgroup for an efficient tree traversal */ - u64 qgroup_seq; - - /* qgroup rescan items */ - struct mutex qgroup_rescan_lock; /* protects the progress item */ - struct btrfs_key qgroup_rescan_progress; - struct btrfs_workqueue *qgroup_rescan_workers; - struct completion qgroup_rescan_completion; - struct btrfs_work qgroup_rescan_work; - bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ - u8 qgroup_drop_subtree_thres; - - /* filesystem state */ - unsigned long fs_state; - - struct btrfs_delayed_root *delayed_root; - - /* Extent buffer radix tree */ - spinlock_t buffer_lock; - /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; - - /* next backup root to be overwritten */ - int backup_root_index; - - /* device replace state */ - struct btrfs_dev_replace dev_replace; - - struct semaphore uuid_tree_rescan_sem; - - /* Used to reclaim the metadata space in the background. */ - struct work_struct async_reclaim_work; - struct work_struct async_data_reclaim_work; - struct work_struct preempt_reclaim_work; - - /* Reclaim partially filled block groups in the background */ - struct work_struct reclaim_bgs_work; - struct list_head reclaim_bgs; - int bg_reclaim_threshold; - - spinlock_t unused_bgs_lock; - struct list_head unused_bgs; - struct mutex unused_bg_unpin_mutex; - /* Protect block groups that are going to be deleted */ - struct mutex reclaim_bgs_lock; - - /* Cached block sizes */ - u32 nodesize; - u32 sectorsize; - /* ilog2 of sectorsize, use to avoid 64bit division */ - u32 sectorsize_bits; - u32 csum_size; - u32 csums_per_leaf; - u32 stripesize; - - /* - * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular - * filesystem, on zoned it depends on the device constraints. - */ - u64 max_extent_size; - - /* Block groups and devices containing active swapfiles. */ - spinlock_t swapfile_pins_lock; - struct rb_root swapfile_pins; - - struct crypto_shash *csum_shash; - - /* Type of exclusive operation running, protected by super_lock */ - enum btrfs_exclusive_operation exclusive_operation; - - /* - * Zone size > 0 when in ZONED mode, otherwise it's used for a check - * if the mode is enabled - */ - u64 zone_size; - - /* Max size to emit ZONE_APPEND write command */ - u64 max_zone_append_size; - struct mutex zoned_meta_io_lock; - spinlock_t treelog_bg_lock; - u64 treelog_bg; - - /* - * Start of the dedicated data relocation block group, protected by - * relocation_bg_lock. - */ - spinlock_t relocation_bg_lock; - u64 data_reloc_bg; - struct mutex zoned_data_reloc_io_lock; - - u64 nr_global_roots; - - spinlock_t zone_active_bgs_lock; - struct list_head zone_active_bgs; - - /* Updates are not protected by any lock */ - struct btrfs_commit_stats commit_stats; - - /* - * Last generation where we dropped a non-relocation root. - * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() - * to change it and to read it, respectively. - */ - u64 last_root_drop_gen; - - /* - * Annotations for transaction events (structures are empty when - * compiled without lockdep). 
- */ - struct lockdep_map btrfs_trans_num_writers_map; - struct lockdep_map btrfs_trans_num_extwriters_map; - struct lockdep_map btrfs_state_change_map[4]; - struct lockdep_map btrfs_trans_pending_ordered_map; - struct lockdep_map btrfs_ordered_extent_map; - -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - spinlock_t ref_verify_lock; - struct rb_root block_tree; -#endif - -#ifdef CONFIG_BTRFS_DEBUG - struct kobject *debug_kobj; - struct list_head allocated_roots; - - spinlock_t eb_leak_lock; - struct list_head allocated_ebs; -#endif -}; - -static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, - u64 gen) -{ - WRITE_ONCE(fs_info->last_root_drop_gen, gen); -} - -static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) -{ - return READ_ONCE(fs_info->last_root_drop_gen); -} - -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* - * Take the number of bytes to be checksummed and figure out how many leaves - * it would require to store the csums for that many bytes. - */ -static inline u64 btrfs_csum_bytes_to_leaves( - const struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; - - return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); -} - -/* - * Use this if we would be adding new items, as we could split nodes as we cow - * down the tree. - */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; -} - -/* - * Doing a truncate or a modification won't result in new nodes or leaves, just - * what we need for COW. - */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; -} - -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ - sizeof(struct btrfs_item)) - -static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) -{ - return fs_info->zone_size > 0; -} - -/* - * Count how many fs_info->max_extent_size cover the @size - */ -static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) -{ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (!fs_info) - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -#endif - - return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); -} - -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op); - /* * The state of btrfs root */ diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index af356feec0c7..d8c931014c2a 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,24 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#define BTRFS_MAX_EXTENT_SIZE SZ_128M + +#define BTRFS_OLDEST_GENERATION 0ULL + +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M + +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + +/* + * The reserved space at the beginning of each device. 
It covers the primary + * super block and leaves space for potential use by other tools like + * bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) /* * Runtime (in-memory) states of filesystem */ @@ -200,6 +218,648 @@ enum { #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) +struct btrfs_dev_replace { + /* See #define above */ + u64 replace_state; + /* Seconds since 1-Jan-1970 */ + time64_t time_started; + /* Seconds since 1-Jan-1970 */ + time64_t time_stopped; + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + /* See #define above */ + u64 cont_reading_from_srcdev_mode; + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + struct mutex lock_finishing_cancel_unmount; + struct rw_semaphore rwsem; + + struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; +}; + +/* + * Free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* Largest extent in this cluster */ + u64 max_size; + + /* First extent starting offset */ + u64 window_start; + + /* We did a full search and couldn't create a cluster */ + bool fragmented; + + struct btrfs_block_group *block_group; + /* + * When a cluster is allocated from a block group, we put the cluster + * onto a list in the block group so that it can be freed before the + * block group is freed. + */ + struct list_head block_group_list; +}; + +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. + */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + u64 prev_discard_time; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + u64 delay_ms; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + +/* Store data about transaction commits, exported via sysfs. 
*/ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + +struct btrfs_fs_info { + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + unsigned long flags; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; + struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; + + /* The log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* Block group cache stuff */ + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; + + /* Keep track of unallocated space */ + atomic64_t free_chunk_space; + + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; + + /* logical->physical extent mapping */ + struct extent_map_tree mapping_tree; + + /* + * Block reservation for extent, checksum, root tree and delayed dir + * index item. + */ + struct btrfs_block_rsv global_block_rsv; + /* Block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* Block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + /* Block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + u64 generation; + u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; + u64 avg_delayed_ref_runtime; + + /* + * This is updated to the current trans every time a full commit is + * required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt; + + unsigned long compress_type:4; + unsigned int compress_level; + u32 commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ + u64 max_inline; + + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. 
+ */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; + struct super_block *sb; + struct inode *btree_inode; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + + /* + * This is taken to make sure we don't set block groups ro after the + * free space cache has been allocated on them. + */ + struct mutex ro_block_group_mutex; + + /* + * This is used during read/modify/write to make sure no two ios are + * trying to mod the same stripe at the same time. + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + + /* + * This protects the ordered operations list only while we are + * processing all of the entries on it. This way we make sure the + * commit code doesn't find the list temporarily empty because another + * function happens to be doing non-waiting preflush before jumping + * into the main commit. + */ + struct mutex ordered_operations_mutex; + + struct rw_semaphore commit_root_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + + spinlock_t trans_lock; + /* + * The reloc mutex goes with the trans lock, it is taken during commit + * to protect us from the relocation code. + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; + + atomic64_t tree_mod_seq; + + /* This protects tree_mod_log and tree_mod_seq_list */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; + + atomic_t async_delalloc_pages; + + /* This is used to protect the following list -- ordered_roots. */ + spinlock_t ordered_root_lock; + + /* + * All fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * + * These can span multiple transactions and basically include every + * dirty data page that isn't from nodatacow. + */ + struct list_head ordered_roots; + + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* All fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; + + /* + * There is a pool of worker threads for checksumming during writes and + * a pool for checksumming after reads. This is because readers can + * run with FS locks held, and the writers may be waiting for those + * locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other two. + */ + struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *endio_raid56_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *caching_workers; + + /* + * Fixup workers take dirty pages that didn't properly go through the + * cow mechanism and make them safe to write. It happens for the + * sys_munmap function call path. 
+ */ + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + u32 thread_pool_size; + + struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; + struct kobject *discard_kobj; + + /* Used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + struct percpu_counter ordered_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * The space_info list is effectively read only after initial setup. + * It is populated at mount time and cleaned up after all block groups + * are removed. RCU is used to protect it. + */ + struct list_head space_info; + + struct btrfs_space_info *data_sinfo; + + struct reloc_control *reloc_ctl; + + /* data_alloc_cluster is only used in ssd_spread mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* All metadata allocations go through this cluster. */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* Auto defrag inodes go here. */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * These three are in extended format (availability of single chunks is + * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted + * by corresponding BTRFS_BLOCK_GROUP_* bits) + */ + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + + /* Balance state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; + + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + + u32 data_chunk_allocations; + u32 metadata_ratio; + + void *bdev_holder; + + /* Private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* Is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* Holds configuration and tracking. Protected by qgroup_lock. */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* + * Used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ + struct mutex qgroup_ioctl_lock; + + /* List of dirty qgroups to be written at next commit. */ + struct list_head dirty_qgroups; + + /* Used by qgroup for an efficient tree traversal. */ + u64 qgroup_seq; + + /* Qgroup rescan items. 
*/ + /* Protects the progress item */ + struct mutex qgroup_rescan_lock; + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + /* Protected by qgroup_rescan_lock */ + bool qgroup_rescan_running; + u8 qgroup_drop_subtree_thres; + + /* Filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ + struct radix_tree_root buffer_radix; + + /* Next backup root to be overwritten */ + int backup_root_index; + + /* Device replace state */ + struct btrfs_dev_replace dev_replace; + + struct semaphore uuid_tree_rescan_sem; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; + + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; + + /* Cached block sizes */ + u32 nodesize; + u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; + u32 csum_size; + u32 csums_per_leaf; + u32 stripesize; + + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + + struct crypto_shash *csum_shash; + + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; + + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + u64 zone_size; + + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; + + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). 
+ */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; +#endif +}; + +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); -- cgit From eb33a4d65b2adae52530d6c3c7a9f58bf734a750 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:53 -0400 Subject: btrfs: move the lockdep helpers into locking.h These more naturally fit in with the locking related code, and they're all defines so they can easily go anywhere, move them out of ctree.h into locking.h Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 76 ------------------------------------------------------ 
fs/btrfs/locking.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ace6f41612e5..970a22a04c94 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -181,82 +181,6 @@ enum { BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; -enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, - BTRFS_LOCKDEP_TRANS_UNBLOCKED, - BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, - BTRFS_LOCKDEP_TRANS_COMPLETED, -}; - -/* - * Lockdep annotation for wait events. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * This macro is used to annotate a wait event. In this case a thread acquires - * the lockdep map as writer (exclusive lock) because it has to block until all - * the threads that hold the lock as readers signal the condition for the wait - * event and release their locks. - */ -#define btrfs_might_wait_for_event(owner, lock) \ - do { \ - rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ - rwsem_release(&owner->lock##_map, _THIS_IP_); \ - } while (0) - -/* - * Protection for the resource/condition of a wait event. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * Many threads can modify the condition for the wait event at the same time - * and signal the threads that block on the wait event. The threads that modify - * the condition and do the signaling acquire the lock as readers (shared - * lock). - */ -#define btrfs_lockdep_acquire(owner, lock) \ - rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) - -/* - * Used after signaling the condition for a wait event to release the lockdep - * map held by a reader thread. - */ -#define btrfs_lockdep_release(owner, lock) \ - rwsem_release(&owner->lock##_map, _THIS_IP_) - -/* - * Macros for the transaction states wait events, similar to the generic wait - * event macros. - */ -#define btrfs_might_wait_for_state(owner, i) \ - do { \ - rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ - } while (0) - -#define btrfs_trans_state_lockdep_acquire(owner, i) \ - rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) - -#define btrfs_trans_state_lockdep_release(owner, i) \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) - -/* Initialization of the lockdep map */ -#define btrfs_lockdep_init_map(owner, lock) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ - } while (0) - -/* Initialization of the transaction states lockdep maps. */ -#define btrfs_state_lockdep_init_map(owner, lock, state) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ - &lock##_key, 0); \ - } while (0) - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 490c7a79e995..11c2269b4b6f 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -78,6 +78,82 @@ enum btrfs_lock_nesting { BTRFS_NESTING_MAX, }; +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, +}; + +/* + * Lockdep annotation for wait events. 
+ * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. + */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. */ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -- cgit From 13d925c1c26916c017b5339625c66bd20df9fd96 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:54 -0400 Subject: btrfs: minor whitespace in ctree.h We've accumulated some whitespace problems in ctree.h, clean these up. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 970a22a04c94..bf94531f4486 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -435,10 +435,8 @@ struct btrfs_file_private { void *filldir_buf; }; - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return info->nodesize - sizeof(struct btrfs_header); } -- cgit From 8483d40242c56b0b225238efed1f73ef6985c5af Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:55 -0400 Subject: btrfs: remove extra space info prototypes in ctree.h These are defined already in space-info.h, remove them from ctree.h. 
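[ Note: the wait event annotation macros above are easiest to read next to a concrete caller. The sketch below uses invented names (struct foo_info, foo_event, foo_writers); only the btrfs_lockdep_* macros come from the patch above, and the token-pasted member must be named <lock>_map for them to resolve. ]

struct foo_info {
	struct lockdep_map foo_event_map;	/* consumed as "foo_event" */
	wait_queue_head_t foo_wait;
	atomic_t foo_writers;
};

static void foo_init(struct foo_info *fi)
{
	/* One-time setup, binding a static lock class to the map. */
	btrfs_lockdep_init_map(fi, foo_event);
}

static void foo_writer_start(struct foo_info *fi)
{
	/* Threads that will modify the wait condition take it shared. */
	btrfs_lockdep_acquire(fi, foo_event);
	atomic_inc(&fi->foo_writers);
}

static void foo_writer_end(struct foo_info *fi)
{
	if (atomic_dec_and_test(&fi->foo_writers))
		wake_up(&fi->foo_wait);
	/* Released only after the condition has been signaled. */
	btrfs_lockdep_release(fi, foo_event);
}

static void foo_wait_for_writers(struct foo_info *fi)
{
	/*
	 * The waiter records an exclusive acquire/release pair, so lockdep
	 * can flag deadlocks against other locks held while waiting.
	 */
	btrfs_might_wait_for_event(fi, foo_event);
	wait_event(fi->foo_wait, atomic_read(&fi->foo_writers) == 0);
}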
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bf94531f4486..a430577cf8b7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -567,8 +567,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -585,7 +583,6 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); -int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_start_write_no_snapshotting(struct btrfs_root *root); -- cgit From e2f13b343c147bc34691301c30d8a8b35d0a2e5c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:56 -0400 Subject: btrfs: move btrfs_account_ro_block_groups_free_space into space-info.c This was prototyped in ctree.h and the code existed in extent-tree.c, but it's space-info related so move it into space-info.c. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/extent-tree.c | 34 ---------------------------------- fs/btrfs/space-info.c | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 1 + 4 files changed, 35 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a430577cf8b7..07f876961da1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -576,7 +576,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush); -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8d28ba360e9a..b97c99a8dd4c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5975,40 +5975,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. 
- */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - struct btrfs_block_group *block_group; - u64 free_bytes = 0; - int factor; - - /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) - return 0; - - spin_lock(&sinfo->lock); - list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { - spin_lock(&block_group->lock); - - if (!block_group->ro) { - spin_unlock(&block_group->lock); - continue; - } - - factor = btrfs_bg_type_to_factor(block_group->flags); - free_bytes += (block_group->length - - block_group->used) * factor; - - spin_unlock(&block_group->lock); - } - spin_unlock(&sinfo->lock); - - return free_bytes; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 94d73485454a..45404798ee8c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1814,3 +1814,37 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) } dump_global_block_rsv(fs_info); } + +/* + * Account the unused space of all the readonly block group in the space_info. + * takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + struct btrfs_block_group *block_group; + u64 free_bytes = 0; + int factor; + + /* It's df, we don't care if it's racy */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + factor = btrfs_bg_type_to_factor(block_group->flags); + free_bytes += (block_group->length - + block_group->used) * factor; + + spin_unlock(&block_group->lock); + } + spin_unlock(&sinfo->lock); + + return free_bytes; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index f28bd2c051d4..fc99ea2b0c34 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -236,5 +236,6 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); #endif /* BTRFS_SPACE_INFO_H */ -- cgit From a0231804affe78d27264811559ee31bd341c2bff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:57 -0400 Subject: btrfs: move extent-tree helpers into their own header file Move all the extent tree related prototypes to extent-tree.h out of ctree.h, and then go include it everywhere needed so everything compiles. 
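[ Note: the reservation helpers quoted from ctree.h at the top of this section are terse; a worked example with illustrative numbers (16 KiB nodesize, 4 KiB sectors, BTRFS_MAX_LEVEL of 8) may help. csums_per_leaf is taken as precomputed, since its derivation is outside this excerpt. ]

/*
 * btrfs_calc_insert_metadata_size(fs_info, 1):
 *   nodesize * BTRFS_MAX_LEVEL * 2 * num_items
 *   = 16384 * 8 * 2 * 1 = 256 KiB
 *   enough to COW one block per level of a maximal-height path, doubled
 *   because an insert can also split each node on the way down.
 *
 * btrfs_calc_metadata_size(fs_info, 1):
 *   = 16384 * 8 * 1 = 128 KiB, no split factor since a truncate or
 *   modification only COWs the existing path.
 *
 * btrfs_csum_bytes_to_leaves(fs_info, 1 MiB):
 *   num_csums = 1048576 >> sectorsize_bits = 1048576 >> 12 = 256
 *   leaves    = DIV_ROUND_UP_ULL(256, csums_per_leaf)
 *   so any csums_per_leaf >= 256 keeps this to a single leaf.
 *
 * count_max_extents(fs_info, 300 MiB) with max_extent_size = 128 MiB:
 *   div_u64(300 MiB + 128 MiB - 1, 128 MiB) = 3, i.e. a ceiling division.
 */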
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 72 ---------------------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent-tree.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + 17 files changed, 87 insertions(+), 72 deletions(-) create mode 100644 fs/btrfs/extent-tree.h (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f556566b9c79..173df40da984 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -17,6 +17,7 @@ #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index d1f8d792b184..5a743c4d4e97 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -19,6 +19,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1283ca46c5bc..f0ddb44704ee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -20,6 +20,7 @@ #include "tree-checker.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 07f876961da1..8ca67227dec0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -495,78 +495,6 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -/* extent-tree.c */ - -enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID, - BTRFS_REF_TYPE_BLOCK, - BTRFS_REF_TYPE_DATA, - BTRFS_REF_TYPE_ANY, -}; - -int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, - enum btrfs_inline_ref_type is_data); -u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); - - -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); -int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, - struct btrfs_path *path); -struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 
parent, u64 root_objectid, - const struct btrfs_disk_key *key, - int level, u64 hint, - u64 empty_size, - enum btrfs_lock_nesting nest); -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref); -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner, - u64 offset, u64 ram_bytes, - struct btrfs_key *ins); -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, - u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); - -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref); - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 75475a213972..005da95c9c17 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b97c99a8dd4c..d2d5de946242 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -38,6 +38,7 @@ #include "dev-replace.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..b3674b008d58 --- /dev/null +++ b/fs/btrfs/extent-tree.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_TREE_H +#define BTRFS_EXTENT_TREE_H + +enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, +}; + +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); + +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 
offset, int metadata, u64 *refs, u64 *flags); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); +int btrfs_cross_ref_exist(struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size, + enum btrfs_lock_nesting nest); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct extent_buffer *eb, u64 flags, int level); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); + +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 62360a554420..bad676d2e23f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -32,6 +32,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index fc79d21e7b4f..a652e5dc465b 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -14,6 +14,7 @@ #include "block-group.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index d66bdbae5585..724507ce7a7d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -12,6 +12,7 @@ #include "print-tree.h" #include "space-info.h" #include "accessors.h" +#include "extent-tree.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78867a084428..cb2a35b19c75 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -57,6 +57,7 @@ #include "inode-item.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2132e3e15986..1d4bdb22881c 100644 --- a/fs/btrfs/ioctl.c 
+++ b/fs/btrfs/ioctl.c @@ -52,6 +52,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4786b59035a8..88b248b22810 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -26,6 +26,7 @@ #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 77d03dce2521..946190f13138 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -30,6 +30,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * Relocation overview diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 45404798ee8c..e5f9f43ffa4c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -12,6 +12,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0d44d50dc3be..a04c2a9708e2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -26,6 +26,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a5e56a678af2..bd9a5eda9def 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -23,6 +23,7 @@ #include "inode-item.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #define MAX_CONFLICT_INODES 10 -- cgit From 2839c2c142dd4baae146b7d3e42ead8fe5d36014 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:58 -0400 Subject: btrfs: move delalloc space related prototypes to delalloc-space.h These exist in delalloc-space.c, move them from ctree.h into delalloc-space.h. 
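[ Note: btrfs_account_ro_block_groups_free_space(), moved a few patches above, multiplies unused logical space by a mirror factor, which is easy to misread. Assuming btrfs_bg_type_to_factor() returns the copy count of the profile (2 for RAID1/DUP, 1 for SINGLE; its exact table is outside this excerpt), the accounting works out as below. ]

/*
 * Hypothetical read-only block groups:
 *
 *   RAID1:  length = 1 GiB, used = 256 MiB, factor = 2
 *           contributes (1024 MiB - 256 MiB) * 2 = 1536 MiB
 *   SINGLE: length = 1 GiB, used = 256 MiB, factor = 1
 *           contributes 768 MiB
 *
 * The factor converts logical free space back into raw device bytes,
 * which is what df-style statfs reporting wants -- hence the
 * "It's df, we don't care if it's racy" comment.
 */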
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/delalloc-space.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8ca67227dec0..afb2bb3acc76 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -500,10 +500,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes, bool noflush); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index e07d46043455..c5d573f2366e 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -20,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); #endif /* BTRFS_DELALLOC_SPACE_H */ -- cgit From 6d2049a2f36f873b8f793fde7ea1874d0d17a3dc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:59 -0400 Subject: btrfs: delete unused function prototypes in ctree.h This batch of prototypes no longer have code associated with them, so remove them. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index afb2bb3acc76..acae38e378a6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -507,12 +507,6 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_start_write_no_snapshotting(struct btrfs_root *root); -void btrfs_end_write_no_snapshotting(struct btrfs_root *root); -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); - /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -- cgit From 45c40c8f9541f336c7857f09ea843fc8fa980b65 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:47:00 -0400 Subject: btrfs: move root tree prototypes to their own header Move all the root-tree.c prototypes to root-tree.h, and then update all the necessary files to include the new header. 
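[ Note: the delalloc prototypes relocated above are used as a bracketed pair by the write paths. A minimal sketch of that calling pattern follows; foo_do_write() is invented, the error handling is simplified, and the real callers in fs/btrfs/file.c carry much more state. ]

static int foo_prepare_write(struct btrfs_inode *inode, u64 start, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	/* Reserves data space plus metadata (outstanding extents, csums). */
	ret = btrfs_delalloc_reserve_space(inode, &reserved, start, len);
	if (ret < 0)
		return ret;		/* -ENOSPC or -EDQUOT */

	ret = foo_do_write(inode, start, len);	/* hypothetical */

	/*
	 * Drop the outstanding_extents bump taken at reserve time; once the
	 * range is marked delalloc the normal machinery owns the accounting.
	 */
	btrfs_delalloc_release_extents(inode, len);

	/* On failure nothing was consumed, so give the reservation back. */
	if (ret < 0)
		btrfs_delalloc_release_space(inode, reserved, start, len, true);

	extent_changeset_free(reserved);
	return ret;
}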
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 32 -------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/root-tree.h | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + 12 files changed, 44 insertions(+), 32 deletions(-) create mode 100644 fs/btrfs/root-tree.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index acae38e378a6..77f1f345d040 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -495,12 +495,6 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -695,32 +689,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *node, struct extent_buffer *parent); -/* root-item.c */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, - const struct fscrypt_str *name); -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, - const struct fscrypt_str *name); -int btrfs_del_root(struct btrfs_trans_handle *trans, - const struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct btrfs_key *key, - struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, - struct btrfs_path *path, struct btrfs_root_item *root_item, - struct btrfs_key *root_key); -int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); -void btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); -void btrfs_check_and_init_root_item(struct btrfs_root_item *item); -void btrfs_update_root_times(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - /* uuid-tree.c */ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 005da95c9c17..ebc9baab0e47 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d2d5de946242..510be00da406 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -39,6 +39,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a652e5dc465b..869d062d6765 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -15,6 +15,7 @@ #include "fs.h" #include "accessors.h" #include 
"extent-tree.h" +#include "root-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb2a35b19c75..ec9f0f4afd12 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,6 +58,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1d4bdb22881c..52a268e05d4a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -53,6 +53,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 88b248b22810..75dd964ca46c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -27,6 +27,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 946190f13138..e345cc71da15 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" /* * Relocation overview diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6aab98114253..42f046e5e25f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -14,6 +14,7 @@ #include "qgroup.h" #include "space-info.h" #include "accessors.h" +#include "root-tree.h" /* * Read a root item from the tree. In case we detect a root item smaller then diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h new file mode 100644 index 000000000000..cbbaca32126e --- /dev/null +++ b/fs/btrfs/root-tree.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ROOT_TREE_H +#define BTRFS_ROOT_TREE_H + +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name); +int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_root_item *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a04c2a9708e2..82b2e2ec90cf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -27,6 +27,7 @@ #include "fs.h" #include "accessors.h" #include 
"extent-tree.h" +#include "root-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bd9a5eda9def..f185cd286243 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -24,6 +24,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #define MAX_CONFLICT_INODES 10 -- cgit From 911bd75aca7390ca9e39191450829b09772c5a77 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 16:16:14 -0400 Subject: btrfs: remove unused function prototypes I wrote the following coccinelle script to find function declarations that didn't have the corresponding code for them @funcproto@ identifier func; type T; position p0; @@ T func@p0(...); @funccode@ identifier funcproto.func; position p1; @@ func@p1(...) { ... } @script:python depends on !funccode@ p0 << funcproto.p0; @@ print("Proto with no function at %s:%s" % (p0[0].file, p0[0].line)) and ran it against btrfs, which identified the 4 function prototypes I've removed in this patch. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.h | 4 ---- 2 files changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 77f1f345d040..09005bd9bfef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -739,7 +739,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -924,8 +923,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); -void btrfs_get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index da040ec327ae..6edc66b4b4d3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -116,8 +116,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -130,8 +128,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -- cgit From 1751850fbd770c89347e14ae9590219bcd2f0649 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Oct 2022 11:13:06 -0400 Subject: btrfs: remove unused btrfs_cond_migrate_bytes The last user of this was removed in 7f9fe6144076 ("btrfs: improve global reserve 
stealing logic"), drop this code as it's no longer called by anybody. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 25 ------------------------- fs/btrfs/block-rsv.h | 3 --- 2 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 069ea9d6a8d4..29705256727f 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -325,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = false; - spin_unlock(&global_rsv->lock); - - btrfs_block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 578c3497a455..7e9016a9e193 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -70,9 +70,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, bool update_size); int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, -- cgit From 43dd529abed2bcf1467f13cce83da1c8456587ea Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 14:21:42 +0200 Subject: btrfs: update function comments Update, reformat or reword function comments. This also removes the kdoc marker so we don't get reports when the function name is missing. 
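[ Note: the "kdoc marker" being removed is the two-asterisk opener that makes scripts/kernel-doc parse and validate a comment; downgrading it to a plain opener silences the missing-function-name reports without losing any text. Illustrative before/after with a hypothetical function: ]

/**
 * btrfs_frob_thing - frob the thing
 * @thing: the thing to frob
 *
 * Parsed by scripts/kernel-doc, which warns if the "name - description"
 * line is missing or does not match the function below.
 */

/*
 * Frob the thing.
 *
 * @thing: the thing to frob
 *
 * An ordinary comment, invisible to kernel-doc.
 */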
Changes made: - remove kdoc markers - reformat the brief description to be a proper sentence - reword to imperative voice - align parameter list - fix typos Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 20 ++++---- fs/btrfs/ctree.c | 11 +++-- fs/btrfs/delalloc-space.c | 59 +++++++++++------------ fs/btrfs/delayed-ref.c | 19 ++++---- fs/btrfs/discard.c | 111 +++++++++++++++++++++++++------------------- fs/btrfs/extent_io.c | 59 ++++++++++++----------- fs/btrfs/extent_map.c | 65 ++++++++++++-------------- fs/btrfs/file-item.c | 50 ++++++++++---------- fs/btrfs/free-space-cache.c | 9 ++-- fs/btrfs/inode.c | 14 +++--- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 13 +++--- fs/btrfs/raid56.c | 4 +- fs/btrfs/reflink.c | 18 +++---- fs/btrfs/space-info.c | 16 +++---- fs/btrfs/tree-log.c | 2 +- fs/btrfs/ulist.c | 35 ++++++++------ fs/btrfs/volumes.c | 43 +++++++++-------- fs/btrfs/zoned.c | 6 +-- fs/btrfs/zstd.c | 2 +- 20 files changed, 289 insertions(+), 269 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5a743c4d4e97..cfe495d39ebd 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -299,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group( return cache; } -/** +/* * Check if we can do a NOCOW write for a given extent. * * @fs_info: The filesystem information object. @@ -340,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, return bg; } -/** +/* * Decrement the number of NOCOW writers in a block group. * - * @bg: The block group. - * * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), * and on the block group returned by that call. Typically this is called after * creating an ordered extent for a NOCOW write, to prevent races with scrub and @@ -1813,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } -/** - * Map a physical disk address to a list of logical addresses +/* + * Map a physical disk address to a list of logical addresses. * * @fs_info: the filesystem * @chunk_start: logical address of block group @@ -3421,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, return ret; } -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @ram_bytes: The number of bytes of file content, and will be same to * @num_bytes except for the compress path. @@ -3465,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, return ret; } -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f0ddb44704ee..4b47d380da1e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2363,7 +2363,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } -/** +/* * Search for a valid slot for the given path. * * @root: The root node of the tree. @@ -3985,14 +3985,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/** - * setup_items_for_insert - Helper called before inserting one or more items - * to a leaf. 
Main purpose is to save stack depth by doing the bulk of the work - * in a function that doesn't call btrfs_search_slot +/* + * Make space in the node before inserting one or more items. * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert + * + * Main purpose is to save stack depth by doing the bulk of the work in a + * function that doesn't call btrfs_search_slot */ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 605d8874a446..7ddb1d104e8e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -202,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, reserved, start, len); } -/** - * Release any excessive reservation +/* + * Release any excessive reservations for an inode. * * @inode: the inode we need to release from * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup @@ -377,12 +377,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, return 0; } -/** - * Release a metadata reservation for an inode +/* + * Release a metadata reservation for an inode. * - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata @@ -405,10 +405,11 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, btrfs_inode_rsv_release(inode, qgroup_free); } -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. - * @num_bytes: the number of bytes we originally reserved with +/* + * Release our outstanding_extents for an inode. + * + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -433,30 +434,30 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. +/* + * Reserve data and metadata space for delalloc + * + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. 
* * This will do the following things * - * - reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space + * - reserve space in data space info for num bytes and reserve precious + * corresponding qgroup space * (Done in check_data_free_space) * * - reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. + * extents and how much csums will be needed also reserve metadata space in a + * per root over-reserve method. * - add to the inodes->delalloc_bytes * - add it to the fs_info's delalloc inodes list. * (Above 3 all done in delalloc_reserve_metadata) * * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) + * Return <0 for error(-ENOSPC or -EDQUOT) */ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) @@ -475,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, return ret; } -/** +/* * Release data and metadata space for delalloc * * @inode: inode we're releasing space for @@ -484,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, * @len: length of the space already reserved * @qgroup_free: should qgroup reserved-space also be freed * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. + * Release the metadata space that was not used and will decrement + * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if + * there are no delalloc bytes left. Also it will handle the qgroup reserved + * space. */ void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 010cc16297d8..573ebab886e2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -71,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) return btrfs_check_space_for_delayed_refs(trans->fs_info); } -/** - * Release a ref head's reservation +/* + * Release a ref head's reservation. * * @fs_info: the filesystem * @nr: number of items to drop * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. + * Drops the delayed ref head's count from the delayed refs rsv and free any + * excess reservation we had. */ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { @@ -104,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) } /* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs + * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates, * it'll calculate the additional size and add it to the delayed_refs_rsv. @@ -139,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) trans->delayed_ref_updates = 0; } -/** - * Transfer bytes to our delayed refs rsv +/* + * Transfer bytes to our delayed refs rsv. 
* * @fs_info: the filesystem * @src: source block rsv to transfer from @@ -188,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, delayed_refs_rsv->space_info, to_free); } -/** - * Refill based on our delayed refs usage +/* + * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 51f0ef386046..ff2e524d9937 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -62,7 +62,7 @@ #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) #define BTRFS_DISCARD_MAX_IOPS (10U) -/* Montonically decreasing minimum length filters after index 0 */ +/* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { 0, BTRFS_ASYNC_DISCARD_MAX_FILTER, @@ -147,10 +147,11 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, return running; } -/** - * find_next_block_group - find block_group that's up next for discarding - * @discard_ctl: discard control - * @now: current time +/* + * Find block_group that's up next for discarding. + * + * @discard_ctl: discard control + * @now: current time * * Iterate over the discard lists to find the next block_group up for * discarding checking the discard_eligible_time of block_group. @@ -185,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group( return ret_block_group; } -/** - * Wrap find_next_block_group() +/* + * Look up next block group and set it for use. * * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management * @now: time when discard was invoked, in ns * - * This wraps find_next_block_group() and sets the block_group to be in use. - * discard_state's control flow is managed here. Variables related to - * discard_state are reset here as needed (eg discard_cursor). @discard_state + * Wrap find_next_block_group() and set the block_group to be in use. + * @discard_state's control flow is managed here. Variables related to + * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state * and @discard_index are remembered as it may change while we're discarding, * but we want the discard to execute in the context determined here. */ @@ -234,10 +235,11 @@ again: return block_group; } -/** - * btrfs_discard_check_filter - updates a block groups filters - * @block_group: block group of interest - * @bytes: recently freed region size after coalescing +/* + * Update a block group's filters. + * + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing * * Async discard maintains multiple lists with progressively smaller filters * to prioritize discarding based on size. Should a free space that matches @@ -272,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group, } } -/** - * btrfs_update_discard_index - moves a block group along the discard lists +/* + * Move a block group along the discard lists. + * * @discard_ctl: discard control * @block_group: block_group of interest * @@ -292,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, add_to_discard_list(discard_ctl, block_group); } -/** - * btrfs_discard_cancel_work - remove a block_group from the discard lists +/* + * Remove a block_group from the discard lists. 
+ * * @discard_ctl: discard control * @block_group: block_group of interest * - * This removes @block_group from the discard lists. If necessary, it waits on - * the current work and then reschedules the delayed work. + * Remove @block_group from the discard lists. If necessary, wait on the + * current work and then reschedule the delayed work. */ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -309,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_queue_work - handles queuing the block_groups +/* + * Handles queuing the block_groups. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This maintains the LRU order of the discard lists. + * Maintain the LRU order of the discard lists. */ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -384,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, } /* - * btrfs_discard_schedule_work - responsible for scheduling the discard work + * Responsible for scheduling the discard work. + * * @discard_ctl: discard control * @override: override the current timer * @@ -402,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, spin_unlock(&discard_ctl->lock); } -/** - * btrfs_finish_discard_pass - determine next step of a block_group +/* + * Determine next step of a block_group. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This determines the next step for a block group after it's finished going - * through a pass on a discard list. If it is unused and fully trimmed, we can - * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto - * the appropriate filter list or let it fall off. + * Determine the next step for a block group after it's finished going through + * a pass on a discard list. If it is unused and fully trimmed, we can mark it + * unused and send it to the unused_bgs path. Otherwise, pass it onto the + * appropriate filter list or let it fall off. */ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -427,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_workfn - discard work function +/* + * Discard work queue callback + * * @work: work * - * This finds the next block_group to start discarding and then discards a - * single region. It does this in a two-pass fashion: first extents and second + * Find the next block_group to start discarding and then discard a single + * region. It does this in a two-pass fashion: first extents and second * bitmaps. Completely discarded block groups are sent to the unused_bgs path. */ static void btrfs_discard_workfn(struct work_struct *work) @@ -508,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_run_discard_work - determines if async discard should be running +/* + * Determine if async discard should be running. + * * @discard_ctl: discard control * - * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. 
*/ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) { @@ -524,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); } -/** - * btrfs_discard_calc_delay - recalculate the base delay +/* + * Recalculate the base delay. + * * @discard_ctl: discard control * * Recalculate the base delay which is based off the total number of @@ -546,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_lock(&discard_ctl->lock); /* - * The following is to fix a potential -1 discrepenancy that we're not + * The following is to fix a potential -1 discrepancy that we're not * sure how to reproduce. But given that this is the only place that * utilizes these numbers and this is only called by from * btrfs_finish_extent_commit() which is synchronized, we can correct @@ -579,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_discard_update_discardable - propagate discard counters +/* + * Propagate discard counters. + * * @block_group: block_group of interest * - * This propagates deltas of counters up to the discard_ctl. It maintains a - * current counter and a previous counter passing the delta up to the global - * stat. Then the current counter value becomes the previous counter value. + * Propagate deltas of counters up to the discard_ctl. It maintains a current + * counter and a previous counter passing the delta up to the global stat. + * Then the current counter value becomes the previous counter value. */ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { @@ -620,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) } } -/** - * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists +/* + * Punt unused_bgs list to discard lists. + * * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the @@ -645,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->unused_bgs_lock); } -/** - * btrfs_discard_purge_list - purge discard lists +/* + * Purge discard lists. + * * @discard_ctl: discard control * * If we are disabling async discard, we may have intercepted block groups that diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4b47bb8c590f..545d9e1f0f83 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1298,7 +1298,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) bio_put(bio); } -/** +/* * Populate every free slot in a provided array with pages. * * @nr_pages: number of pages to allocate @@ -1334,16 +1334,16 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/** - * Attempt to add a page to bio +/* + * Attempt to add a page to bio. 
* - * @bio_ctrl: record both the bio, and its bio_flags - * @page: page to add to the bio - * @disk_bytenr: offset of the new bio or to check whether we are adding - * a contiguous page to the previous one - * @size: portion of page that we want to write - * @pg_offset: starting offset in the page - * @compress_type: compression type of the current bio to see if we can merge them + * @bio_ctrl: record both the bio, and its bio_flags + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @size: portion of page that we want to write + * @pg_offset: starting offset in the page + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3066,7 +3066,7 @@ retry: return ret; } -/** +/* * Walk the list of dirty pages of the given address space and write all of them. * * @mapping: address space structure to write @@ -5460,11 +5460,12 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, *page_offset = offset_in_page(offset); } -/** - * extent_buffer_test_bit - determine whether a bit in a bitmap item is set - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number to test +/* + * Determine whether a bit in a bitmap item is set. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test */ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) @@ -5481,12 +5482,13 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } -/** - * extent_buffer_bitmap_set - set an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to set +/* + * Set an area of a bitmap to 1. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set */ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) @@ -5523,12 +5525,13 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star } -/** - * extent_buffer_bitmap_clear - clear an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to clear +/* + * Clear an area of a bitmap. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear */ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f97508afb659..cd45303f344d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -28,12 +28,9 @@ void __cold extent_map_exit(void) kmem_cache_destroy(extent_map_cache); } -/** - * extent_map_tree_init - initialize extent map tree - * @tree: tree to initialize - * - * Initialize the extent tree @tree. Should be called for each new inode - * or other user of the extent_map interface. +/* + * Initialize the extent tree @tree. Should be called for each new inode or + * other user of the extent_map interface. 
*/ void extent_map_tree_init(struct extent_map_tree *tree) { @@ -42,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree) rwlock_init(&tree->lock); } -/** - * alloc_extent_map - allocate new extent map structure - * - * Allocate a new extent_map structure. The new structure is - * returned with a reference count of one and needs to be - * freed using free_extent_map() +/* + * Allocate a new extent_map structure. The new structure is returned with a + * reference count of one and needs to be freed using free_extent_map() */ struct extent_map *alloc_extent_map(void) { @@ -62,12 +56,9 @@ struct extent_map *alloc_extent_map(void) return em; } -/** - * free_extent_map - drop reference count of an extent_map - * @em: extent map being released - * - * Drops the reference out on @em by one and free the structure - * if the reference count hits zero. +/* + * Drop the reference out on @em by one and free the structure if the reference + * count hits zero. */ void free_extent_map(struct extent_map *em) { @@ -82,7 +73,7 @@ void free_extent_map(struct extent_map *em) } } -/* simple helper to do math around the end of an extent, handling wrap */ +/* Do the math around the end of an extent, handling wrapping. */ static u64 range_end(u64 start, u64 len) { if (start + len < start) @@ -138,8 +129,8 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) } /* - * search through the tree for an extent_map with a given offset. If - * it can't be found, try to find some neighboring extents + * Search through the tree for an extent_map with a given offset. If it can't + * be found, try to find some neighboring extents */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_or_next_ret) @@ -191,7 +182,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* check to see if two extent_map structs are adjacent and safe to merge */ +/* Check to see if two extent_map structs are adjacent and safe to merge. */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -289,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) } } -/** - * unpin_extent_cache - unpin an extent from the cache +/* + * Unpin an extent from the cache. + * * @tree: tree to unpin the extent in * @start: logical offset in the file * @len: length of the extent @@ -393,7 +385,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } } -/** +/* * Add new extent map to the extent tree * * @tree: tree to insert new map in @@ -452,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree, return em; } -/** - * lookup_extent_mapping - lookup extent_map +/* + * Lookup extent_map that intersects @start + @len range. + * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -469,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 1); } -/** - * search_extent_mapping - find a nearby extent map +/* + * Find a nearby extent map intersecting @start + @len (not an exact search). 
+ * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -486,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 0); } -/** - * remove_extent_mapping - removes an extent_map from the extent tree +/* + * Remove an extent_map from the extent tree. + * * @tree: extent tree to remove from * @em: extent map being removed * - * Removes @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use + * Remove @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use. */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { @@ -615,8 +610,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, return add_extent_mapping(em_tree, em, 0); } -/** - * Add extent mapping into em_tree +/* + * Add extent mapping into em_tree. * * @fs_info: the filesystem * @em_tree: extent tree into which we want to insert the extent mapping diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bce6c8d31bc0..403a857d230e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -27,8 +27,8 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -/** - * Set inode's size according to filesystem options +/* + * Set inode's size according to filesystem options. * * @inode: inode we want to update the disk_i_size for * @new_i_size: i_size we want to set to, 0 if we use i_size @@ -67,8 +67,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz spin_unlock(&inode->lock); } -/** - * Mark range within a file as having a new extent inserted +/* + * Mark range within a file as having a new extent inserted. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -95,8 +95,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, EXTENT_DIRTY); } -/** - * Marks an inode range as not having a backing extent +/* + * Mark an inode range as not having a backing extent. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -257,7 +257,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, /* * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and - * estore the result to @dst. + * store the result to @dst. * * Return >0 for the number of sectors we found. * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum @@ -363,15 +363,15 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, return ret; } -/** +/* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. + * @inode: inode that the bio is for. + * @bio: bio to look up. + * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return + * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If + * NULL, the checksum buffer is allocated and returned in + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. 
*/ @@ -633,8 +633,8 @@ fail: return ret; } -/** - * Calculate checksums of the data contained inside a bio +/* + * Calculate checksums of the data contained inside a bio. * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed @@ -749,15 +749,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } /* - * helper function for csum removal, this expects the - * key to describe the csum pointed to by the path, and it expects - * the csum to overlap the range [bytenr, len] + * Remove one checksum overlapping a range. + * + * This expects the key to describe the csum pointed to by the path, and it + * expects the csum to overlap the range [bytenr, len] * - * The csum should not be entirely contained in the range and the - * range should not be entirely contained in the csum. + * The csum should not be entirely contained in the range and the range should + * not be entirely contained in the csum. * - * This calls btrfs_truncate_item with the correct args based on the - * overlap, and fixes up the key as required. + * This calls btrfs_truncate_item with the correct args based on the overlap, + * and fixes up the key as required. */ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, struct btrfs_path *path, @@ -806,8 +807,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, } /* - * deletes the csum items from the csum tree for a given - * range of bytes. + * Delete the csum items from the csum tree for a given range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 26ff1a5100b9..599b41523479 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1369,8 +1369,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, path, block_group->start); } -/** - * Write out cached info to an inode +/* + * Write out cached info to an inode. * * @root: root the inode belongs to * @inode: freespace inode we are writing out @@ -3034,10 +3034,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) } -/** - * btrfs_is_free_space_trimmed - see if everything is trimmed - * @block_group: block_group of interest - * +/* * Walk @block_group's free space rb_tree to determine if everything is trimmed. */ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec9f0f4afd12..5b9857e30a08 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6895,18 +6895,18 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -/** - * btrfs_get_extent - Lookup the first extent overlapping a range in a file. +/* + * Lookup the first extent overlapping a range in a file. + * * @inode: file to search in * @page: page to read extent data into if the extent is inline * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * - * This returns the first &struct extent_map which overlaps with the given - * range, reading it from the B-tree and caching it if necessary. Note that - * there may be more extents which overlap the given range after the returned - * extent_map. + * Return the first &struct extent_map which overlaps the given range, reading + * it from the B-tree and caching it if necessary. Note that there may be more + * extents which overlap the given range after the returned extent_map. 
* * If @page is not NULL and the extent is inline, this also reads the extent * data directly into the page and marks the extent up to date in the io_tree. @@ -11310,7 +11310,7 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } -/** +/* * Verify that there are no ordered extents for a given file range. * * @inode: The target inode.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 52a268e05d4a..96858240588d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4346,7 +4346,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } -/** +/* * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as * required. *
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1cbaacdc50da..1c36407803ca 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -144,7 +144,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/** +/* * Add an ordered extent to the per-inode tree. * * @inode: Inode that this extent is for. @@ -1020,17 +1020,18 @@ out: } /* - * btrfs_flush_ordered_range - Lock the passed range and ensures all pending - * ordered extents in it are run to completion. + * Lock the passed range and ensure all pending ordered extents in it are run + * to completion. * * @inode: Inode whose ordered tree is to be searched * @start: Beginning of range to flush * @end: Last byte of range to lock * @cached_state: If passed, will return the extent state responsible for the - * locked range. It's the caller's responsibility to free the cached state. + * locked range. It's the caller's responsibility to free the + * cached state. * - * This function always returns with the given range locked, ensuring after it's - * called no order extent can be pending. + * Always return with the given range locked, ensuring after it's called no + * ordered extent can be pending. */ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 349270537c2e..ef272e7d932c 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -908,8 +908,8 @@ static void raid_write_end_io(struct bio *bio) rbio_orig_end_io(rbio, err); } -/** - * Get a sector pointer specified by its @stripe_nr and @sector_nr +/* + * Get a sector pointer specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe)
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0243eb33df7..9ba7914c7169 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -321,16 +321,16 @@ copy_to_page: goto out; } -/** - * btrfs_clone() - clone a range from inode file to another +/* + * Clone a range from inode file to another.
* - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e5f9f43ffa4c..11e5b5a1eb5a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1493,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -/** - * Do the appropriate flushing and waiting for a ticket +/* + * Do the appropriate flushing and waiting for a ticket. * * @fs_info: the filesystem * @space_info: space info for the reservation @@ -1596,8 +1596,8 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) flush != BTRFS_RESERVE_FLUSH_EMERGENCY); } -/** - * Try to reserve bytes from the block_rsv's space +/* + * Try to reserve bytes from the block_rsv's space. * * @fs_info: the filesystem * @space_info: space info we want to allocate from @@ -1736,8 +1736,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, orig_bytes, flush); } -/** - * Trye to reserve metadata bytes from the block_rsv's space +/* + * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for @@ -1771,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, return ret; } -/** - * Try to reserve data bytes for an allocation +/* + * Try to reserve data bytes for an allocation. * * @fs_info: the filesystem * @bytes: number of bytes we need diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f185cd286243..e358df3668e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7381,7 +7381,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/** +/* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index f2f20c8d84aa..13af1e41f9d7 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -38,8 +38,9 @@ * loop would be similar to the above. */ -/** - * ulist_init - freshly initialize a ulist +/* + * Freshly initialize a ulist. + * * @ulist: the ulist to initialize * * Note: don't use this function to init an already used ulist, use @@ -52,8 +53,9 @@ void ulist_init(struct ulist *ulist) ulist->nnodes = 0; } -/** - * ulist_release - free up additionally allocated memory for the ulist +/* + * Free up additionally allocated memory for the ulist. + * * @ulist: the ulist from which to free the additional memory * * This is useful in cases where the base 'struct ulist' has been statically @@ -71,8 +73,9 @@ void ulist_release(struct ulist *ulist) INIT_LIST_HEAD(&ulist->nodes); } -/** - * ulist_reinit - prepare a ulist for reuse +/* + * Prepare a ulist for reuse. 
+ * * @ulist: ulist to be reused * * Free up all additional memory allocated for the list elements and reinit * @@ -84,8 +87,9 @@ void ulist_reinit(struct ulist *ulist) ulist_init(ulist); } -/** - * ulist_alloc - dynamically allocate a ulist +/* + * Dynamically allocate a ulist. + * * @gfp_mask: allocation flags to for base allocation * * The allocated ulist will be returned in an initialized state. @@ -102,8 +106,9 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -/** - * ulist_free - free dynamically allocated ulist +/* + * Free dynamically allocated ulist. + * * @ulist: ulist to free * * It is not necessary to call ulist_release before. @@ -164,8 +169,9 @@ static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) return 0; } -/** - * ulist_add - add an element to the ulist +/* + * Add an element to the ulist. + * * @ulist: ulist to add the element to * @val: value to add to ulist * @aux: auxiliary value to store along with val @@ -243,8 +249,9 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) return 0; } -/** - * ulist_next - iterate ulist +/* + * Iterate ulist. + * * @ulist: ulist to iterate * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) *
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 767c56be6b96..f61bb79d4a7e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -531,14 +531,14 @@ error: return ret; } -/** - * Search and remove all stale devices (which are not mounted). - * When both inputs are NULL, it will search and release all stale devices. +/* + * Search and remove all stale devices (which are not mounted). When both + * inputs are NULL, it will search and release all stale devices. * - * @devt: Optional. When provided will it release all unmounted devices - * matching this devt only. + * @devt: Optional. When provided, it will release all unmounted devices + * matching this devt only. * @skip_device: Optional. Will skip this device when searching for the stale - * devices. + * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. @@ -1478,8 +1478,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, return changed; } -/** - * dev_extent_hole_check - check if specified hole is suitable for allocation +/* + * Check if specified hole is suitable for allocation. + * * @device: the device which we have the hole * @hole_start: starting position of the hole * @hole_size: the size of the hole @@ -1533,7 +1534,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } /* - * find_free_dev_extent_start - find free space in the specified device + * Find free space in the specified device. + * * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @search_start: the position from which to begin the search @@ -1541,9 +1543,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * @len: the size of the free space. that we find, or the size * of the max free space if we don't find suitable free space * - * this uses a pretty simple search, the expectation is that it is - * called very infrequently and that a given device has a small number - * of extents + * This does a pretty simple search; the expectation is that it is called very + * infrequently and that a given device has a small number of extents. * * @start is used to store the start of the free space if we find.
But if we * don't find suitable free space, it will be used to store the start position @@ -2322,8 +2323,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -/** - * Populate args from device at path +/* + * Populate args from device at path. * * @fs_info: the filesystem * @args: the args to populate @@ -4031,10 +4032,11 @@ error: return ret; } -/** - * alloc_profile_is_valid - see if a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile +/* + * See if a given profile is valid and reduced. + * + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile */ static int alloc_profile_is_valid(u64 flags, int extended) { @@ -7009,8 +7011,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, return device; } -/** - * btrfs_alloc_device - allocate struct btrfs_device +/* + * Allocate new device struct, set up devid and UUID. + * * @fs_info: used only for generating a new devid, can be NULL if * devid is provided (i.e. @devid != NULL). * @devid: a pointer to devid for this device. If NULL a new devid diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fe66cb929af9..13034c3bbb65 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1020,8 +1020,8 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } -/** - * btrfs_find_allocatable_zones - find allocatable zones within a given region +/* + * Find allocatable zones within a given region. * * @device: the device to allocate a region on * @hole_start: the position of the hole to allocate the region @@ -1864,7 +1864,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } -/** +/* * Activate block group and underlying device zones * * @block_group: the block group to activate diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 35a0224d4eb7..4575b3703e74 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -94,7 +94,7 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/** +/* * Timer callback to free unused workspaces. * * @t: timer -- cgit From cb9a10a6504bb35095e97f4839fcf353bf1458ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:16 -0400 Subject: btrfs: convert discard stat defs to enum Do away with the defines and use an enum as it's cleaner. Suggested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index cab954a9d97b..a855e0483e03 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -48,9 +48,11 @@ static inline bool btrfs_free_space_trimming_bitmap( * to make it clear what we're doing. An example is discard_extents in * btrfs_free_space_ctl. 
*/ -#define BTRFS_STAT_NR_ENTRIES 2 -#define BTRFS_STAT_CURR 0 -#define BTRFS_STAT_PREV 1 +enum { + BTRFS_STAT_CURR, + BTRFS_STAT_PREV, + BTRFS_STAT_NR_ENTRIES, +}; struct btrfs_free_space_ctl { spinlock_t tree_lock;
-- cgit From b31bed170d5241d4c9e2fc572f31fc15b274a6c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:17 -0400 Subject: btrfs: move btrfs_chunk_item_size out of ctree.h

This is used by the volumes code and the tree checker code. We want to keep it inline, however, so simply move it to volumes.h.

Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/volumes.h | 8 ++++++++ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs')
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 09005bd9bfef..1e0944c37547 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -54,13 +54,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -static inline unsigned long btrfs_chunk_item_size(int num_stripes) -{ - BUG_ON(num_stripes == 0); - return sizeof(struct btrfs_chunk) + - sizeof(struct btrfs_stripe) * (num_stripes - 1); -} - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a20ee7d57831..b05124e62412 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -10,6 +10,7 @@ #include #include #include "async-thread.h" +#include "messages.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -605,6 +606,13 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + ASSERT(num_stripes); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
-- cgit From 3683fbbc2314d6721a353f8bf4285a819ea9b512 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:18 -0400 Subject: btrfs: add dependencies to fs.h and block-rsv.h

There are several structures embedded inside of struct btrfs_fs_info, so if we don't have all the proper includes when we include fs.h we'll get a variety of compile errors. I fixed this by adding a temporary C file that just had #include "fs.h" and then added include files until the compiler stopped complaining.
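For illustration, the probe looked roughly like this (the file name is arbitrary and the file was never committed):

	/* temporary compile probe, e.g. fs/btrfs/header-probe.c */
	#include "fs.h"

Temporarily wiring it into the build, for example with a throwaway "btrfs-y += header-probe.o" in fs/btrfs/Makefile followed by "make fs/btrfs/", makes each missing type or forward declaration show up as a compile error pointing at the next #include that fs.h needs, until the header builds standalone.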
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.h | 1 + fs/btrfs/fs.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'fs')
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 7e9016a9e193..e2cf0dd43203 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -4,6 +4,7 @@ #define BTRFS_BLOCK_RSV_H struct btrfs_trans_handle; +struct btrfs_root; enum btrfs_reserve_flush_enum; /*
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index d8c931014c2a..a49d11127bf7 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,14 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include +#include +#include +#include "extent-io-tree.h" +#include "extent_map.h" +#include "async-thread.h" +#include "block-rsv.h" + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL
-- cgit From 5034388342564877879e2a94faf72eff468a7437 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:19 -0400 Subject: btrfs: add blk_types.h include to compression.h

When moving the printk messages into their own file I got a compiler error because the includes grabbed compression.h, but nothing pulled in the blk_types.h dependency that compression.h has because it uses blk_status_t. Add blk_types.h to compression.h so that this sort of thing doesn't happen in the future.

Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs')
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9da2343eeff5..b961462399ae 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,6 +6,7 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H +#include <linux/blk_types.h> #include struct btrfs_inode;
-- cgit From 083bd7e54e8e2c1d169f21236873272cdf6f409a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:20 -0400 Subject: btrfs: move the printk and assert helpers to messages.c

These helpers are core to btrfs, and in order to more easily sync various parts of the btrfs kernel code into btrfs-progs we need to be able to carry these helpers with us. However, we want to have our own implementation for the helpers themselves; currently they're implemented in different files that we want to sync inside of btrfs-progs itself. Move these into their own C file; this will allow us to contain our overrides in btrfs-progs in its own file without messing with the rest of the codebase. In copying things over I fixed up a few whitespace errors that already existed.
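For illustration, a userspace counterpart of one of the moved helpers could be as small as the following sketch (the btrfs-progs file layout here is an assumption; the prototype matches the kernel helper added below):

	/* hypothetical btrfs-progs messages.c: same prototype, userspace behavior */
	#include <stdio.h>
	#include <stdlib.h>

	void btrfs_assertfail(const char *expr, const char *file, int line)
	{
		fprintf(stderr, "assertion failed: %s, in %s:%d\n", expr, file, line);
		abort();	/* userspace stand-in for the kernel's BUG() */
	}

As long as the prototypes stay identical, the shared code compiles unchanged against either implementation.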
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/messages.c | 352 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/super.c | 346 --------------------------------------------------- 3 files changed, 353 insertions(+), 347 deletions(-) create mode 100644 fs/btrfs/messages.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 76f90dcfb14d..bb170c9d030d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c new file mode 100644 index 000000000000..196757ee16f1 --- /dev/null +++ b/fs/btrfs/messages.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fs.h" +#include "messages.h" +#include "discard.h" +#include "transaction.h" +#include "space-info.h" + +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sort of corruptions. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. 
+ */ +const char * __attribute_const__ btrfs_decode_error(int errno) +{ + char *errstr = "unknown"; + + switch (errno) { + case -ENOENT: /* -2 */ + errstr = "No such entry"; + break; + case -EIO: /* -5 */ + errstr = "IO failure"; + break; + case -ENOMEM: /* -12*/ + errstr = "Out of memory"; + break; + case -EEXIST: /* -17 */ + errstr = "Object already exists"; + break; + case -ENOSPC: /* -28 */ + errstr = "No space left"; + break; + case -EROFS: /* -30 */ + errstr = "Readonly filesystem"; + break; + case -EOPNOTSUPP: /* -95 */ + errstr = "Operation not supported"; + break; + case -EUCLEAN: /* -117 */ + errstr = "Filesystem corrupted"; + break; + case -EDQUOT: /* -122 */ + errstr = "Quota exceeded"; + break; + } + + return errstr; +} + +/* + * __btrfs_handle_fs_error decodes expected errors from the caller and + * invokes the appropriate error response. + */ +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; + const char *errstr; +#endif + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt); +#endif + + /* + * Special case: if the error is EROFS, and we're already under + * SB_RDONLY, then it is safe here. + */ + if (errno == -EROFS && sb_rdonly(sb)) + return; + +#ifdef CONFIG_PRINTK + errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); + va_end(args); + } else { + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); + } +#endif + + /* + * Today we only save the error info to memory. Long term we'll also + * send it down to the disk. + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + + /* Don't go through full error handling during mount. */ + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + btrfs_discard_stop(fs_info); + + /* Handle error by forcing the filesystem readonly. */ + btrfs_set_sb_rdonly(sb); + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writable + * again, the device replace operation continues. + */ +} + +#ifdef CONFIG_PRINTK +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. 
+ */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; + struct va_format vaf; + va_list args; + int kern_level; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + + va_start(args, fmt); + + while ((kern_level = printk_get_level(fmt)) != 0) { + size_t size = printk_skip_level(fmt) - fmt; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } + fmt += size; + } + + vaf.fmt = fmt; + vaf.va = &args; + + if (__ratelimit(ratelimit)) { + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } + } + + va_end(args); +} +#endif + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. 
+ * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. + */ +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = ""; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 33bf211699f8..a4030dfeb2f2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -70,352 +70,6 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -#ifdef CONFIG_PRINTK - -#define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) - -/* - * Characters to print to indicate error conditions or uncommon filesystem state. - * RO is not an error. - */ -static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', - [BTRFS_FS_STATE_REMOUNTING] = 'M', - [BTRFS_FS_STATE_RO] = 0, - [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', - [BTRFS_FS_STATE_DEV_REPLACING] = 'R', - [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, - [BTRFS_FS_STATE_NO_CSUMS] = 'C', - [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', -}; - -static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) -{ - unsigned int bit; - bool states_printed = false; - unsigned long fs_state = READ_ONCE(info->fs_state); - char *curr = buf; - - memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); - curr += sizeof(STATE_STRING_PREFACE) - 1; - - for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { - WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); - if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { - *curr++ = fs_state_chars[bit]; - states_printed = true; - } - } - - /* If no states were printed, reset the buffer */ - if (!states_printed) - curr = buf; - - *curr++ = 0; -} -#endif - -/* - * Generally the error codes correspond to their respective errors, but there - * are a few special cases. - * - * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for - * instance will return EUCLEAN if any of the blocks are corrupted in - * a way that is problematic. We want to reserve EUCLEAN for these - * sort of corruptions. 
- * - * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we - * need to use EROFS for this case. We will have no idea of the - * original failure, that will have been reported at the time we tripped - * over the error. Each subsequent error that doesn't have any context - * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. - */ -const char * __attribute_const__ btrfs_decode_error(int errno) -{ - char *errstr = "unknown"; - - switch (errno) { - case -ENOENT: /* -2 */ - errstr = "No such entry"; - break; - case -EIO: /* -5 */ - errstr = "IO failure"; - break; - case -ENOMEM: /* -12*/ - errstr = "Out of memory"; - break; - case -EEXIST: /* -17 */ - errstr = "Object already exists"; - break; - case -ENOSPC: /* -28 */ - errstr = "No space left"; - break; - case -EROFS: /* -30 */ - errstr = "Readonly filesystem"; - break; - case -EOPNOTSUPP: /* -95 */ - errstr = "Operation not supported"; - break; - case -EUCLEAN: /* -117 */ - errstr = "Filesystem corrupted"; - break; - case -EDQUOT: /* -122 */ - errstr = "Quota exceeded"; - break; - } - - return errstr; -} - -/* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. - */ -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; -#ifdef CONFIG_PRINTK - char statestr[STATE_STRING_BUF_LEN]; - const char *errstr; -#endif - -#ifdef CONFIG_PRINTK_INDEX - printk_index_subsys_emit( - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", - KERN_CRIT, fmt); -#endif - - /* - * Special case: if the error is EROFS, and we're already - * under SB_RDONLY, then it is safe here. - */ - if (errno == -EROFS && sb_rdonly(sb)) - return; - -#ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); - btrfs_state_to_string(fs_info, statestr); - if (fmt) { - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); - va_end(args); - } else { - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); - } -#endif - - /* - * Today we only save the error info to memory. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); - - /* Don't go through full error handling during mount */ - if (!(sb->s_flags & SB_BORN)) - return; - - if (sb_rdonly(sb)) - return; - - btrfs_discard_stop(fs_info); - - /* btrfs handle error by forcing the filesystem readonly */ - btrfs_set_sb_rdonly(sb); - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not canceled here - * although there is no way to update the progress. It would add the - * risk of a deadlock, therefore the canceling is omitted. The only - * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writable - * again, the device replace operation continues. - */ -} - -#ifdef CONFIG_PRINTK -static const char * const logtypes[] = { - "emergency", - "alert", - "critical", - "error", - "warning", - "notice", - "info", - "debug", -}; - - -/* - * Use one ratelimit state per log level so that a flood of less important - * messages doesn't cause more important ones to be dropped. 
- */ -static struct ratelimit_state printk_limits[] = { - RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), -}; - -void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; - struct va_format vaf; - va_list args; - int kern_level; - const char *type = logtypes[4]; - struct ratelimit_state *ratelimit = &printk_limits[4]; - -#ifdef CONFIG_PRINTK_INDEX - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); -#endif - - va_start(args, fmt); - - while ((kern_level = printk_get_level(fmt)) != 0) { - size_t size = printk_skip_level(fmt) - fmt; - - if (kern_level >= '0' && kern_level <= '7') { - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } - fmt += size; - } - - vaf.fmt = fmt; - vaf.va = &args; - - if (__ratelimit(ratelimit)) { - if (fs_info) { - char statestr[STATE_STRING_BUF_LEN]; - - btrfs_state_to_string(fs_info, statestr); - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, - fs_info->sb->s_id, statestr, &vaf); - } else { - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); - } - } - - va_end(args); -} -#endif - -#ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} -#endif - -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} - -#if BITS_PER_LONG == 32 -void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { - btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); - btrfs_warn(fs_info, -"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_warn(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} - -void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { - btrfs_err(fs_info, "reached 32bit limit for logical addresses"); - btrfs_err(fs_info, -"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_err(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} -#endif - -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. 
- * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} -/* - * __btrfs_panic decodes unexpected, fatal errors from the caller, - * issues an alert, and either panics or BUGs, depending on mount options. - */ -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - char *s_id = ""; - const char *errstr; - struct va_format vaf = { .fmt = fmt }; - va_list args; - - if (fs_info) - s_id = fs_info->sb->s_id; - - va_start(args, fmt); - vaf.va = &args; - - errstr = btrfs_decode_error(errno); - if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); - - btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); - va_end(args); - /* Caller calls BUG() */ -} - static void btrfs_put_super(struct super_block *sb) { close_ctree(btrfs_sb(sb)); -- cgit From 2885fd632050ef47317a97a3a68bc98634a6e11e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:21 -0400 Subject: btrfs: move inode prototypes to btrfs_inode.h I initially wanted to make a new header file for this, but these prototypes do naturally fit into btrfs_inode.h. If we want to extract vfs from pure btrfs code in the future we may need to split this up, but btrfs_inode embeds the vfs_inode, so it makes sense to put the prototypes in this header for now. 
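For context, the embedding looks roughly like this (heavily abridged from btrfs_inode.h; only the members relevant to the argument are shown):

	struct btrfs_inode {
		/* which root the inode belongs to */
		struct btrfs_root *root;

		/* ... many btrfs-specific fields ... */

		/* the embedded vfs inode */
		struct inode vfs_inode;
	};

Because the VFS inode is a member of btrfs_inode, prototypes that take a struct inode * and ones that take a struct btrfs_inode * sit naturally in the same header.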
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 136 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 140 ------------------------------------------------- 2 files changed, 136 insertions(+), 140 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 530a0ebfab3f..d21c30bf7053 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -410,4 +410,140 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes + +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + const struct fscrypt_str *name); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, + const struct fscrypt_str *name, int add_backref, u64 index); +int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); + +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, + bool in_reclaim_context); +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state); + +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). 
*/ + struct posix_acl *default_acl; + struct posix_acl *acl; + struct fscrypt_name fname; +}; + +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + u32 bits); +void btrfs_clear_delalloc_extent(struct inode *inode, + struct extent_state *state, u32 bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); +void btrfs_evict_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int __init btrfs_init_cachep(void); +void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path); +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 end); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page); +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, + u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); 
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + +extern const struct dentry_operations btrfs_dentry_operations; + +/* Inode locking type flags, by default the exclusive lock is taken. */ +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; + +int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, + const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); + #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e0944c37547..a9267b117860 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -764,146 +764,6 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); -/* inode.c */ -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); - -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode); -struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct fscrypt_str *name); -int btrfs_add_link(struct btrfs_trans_handle *trans, - struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); - -int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, - bool in_reclaim_context); -int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - unsigned int extra_bits, - struct extent_state **cached_state); -struct btrfs_new_inode_args { - /* Input */ - struct inode *dir; - struct dentry *dentry; - struct inode *inode; - bool orphan; - bool subvol; - - /* - * Output from btrfs_new_inode_prepare(), input to - * btrfs_create_new_inode(). 
- */ - struct posix_acl *default_acl; - struct posix_acl *acl; - struct fscrypt_name fname; -}; -int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, - unsigned int *trans_num_items); -int btrfs_create_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_new_inode_args *args); -void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, - struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, - struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, - struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -void btrfs_evict_inode(struct inode *inode); -struct inode *btrfs_alloc_inode(struct super_block *sb); -void btrfs_destroy_inode(struct inode *inode); -void btrfs_free_inode(struct inode *inode); -int btrfs_drop_inode(struct inode *inode); -int __init btrfs_init_cachep(void); -void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); -struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end); -int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -int btrfs_orphan_cleanup(struct btrfs_root *root); -int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); -void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_prealloc_file_range_trans(struct inode *inode, - struct btrfs_trans_handle *trans, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); -int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, - int compress_type); -int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); -ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); 
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); - -extern const struct dentry_operations btrfs_dentry_operations; - -/* Inode locking type flags, by default the exclusive lock is taken */ -enum btrfs_ilock_type { - ENUM_BIT(BTRFS_ILOCK_SHARED), - ENUM_BIT(BTRFS_ILOCK_TRY), - ENUM_BIT(BTRFS_ILOCK_MMAP), -}; - -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); -void btrfs_update_inode_bytes(struct btrfs_inode *inode, - const u64 add_bytes, - const u64 del_bytes); -void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); - /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -- cgit From 778dd695dd4d5a21eff07bb1570b570da69dfbd9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:22 -0400 Subject: btrfs: rename tree-defrag.c to defrag.c This currently has only one helper in it, and it's for tree based defrag. We have the various defrag code in 3 different places, so rename this to defrag.c. Followup patches will move the code into this new file. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/defrag.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-defrag.c | 133 ------------------------------------------------- 3 files changed, 134 insertions(+), 134 deletions(-) create mode 100644 fs/btrfs/defrag.c delete mode 100644 fs/btrfs/tree-defrag.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index bb170c9d030d..84fb3b4c35b0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -23,7 +23,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ - transaction.o inode.o file.o tree-defrag.o \ + transaction.o inode.o file.o defrag.o \ extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c new file mode 100644 index 000000000000..5df604846de6 --- /dev/null +++ b/fs/btrfs/defrag.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + */ + +#include +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" +#include "accessors.h" + +/* + * Defrag all the leaves in a given btree. 
+ * Read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int next_key_ret = 0; + u64 last_ret = 0; + + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). + */ + path->lowest_level = 1; + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + &last_ret, + &root->defrag_progress); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. + */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + BTRFS_OLDEST_GENERATION); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } +out: + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + + return ret; +} diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c deleted file mode 100644 index 5df604846de6..000000000000 --- a/fs/btrfs/tree-defrag.c +++ /dev/null @@ -1,133 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. 
- */ - -#include -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" -#include "locking.h" -#include "accessors.h" - -/* - * Defrag all the leaves in a given btree. - * Read all the leaves and try to get key order to - * better reflect disk order - */ - -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_path *path = NULL; - struct btrfs_key key; - int ret = 0; - int wret; - int level; - int next_key_ret = 0; - u64 last_ret = 0; - - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - goto out; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - level = btrfs_header_level(root->node); - - if (level == 0) - goto out; - - if (root->defrag_progress.objectid == 0) { - struct extent_buffer *root_node; - u32 nritems; - - root_node = btrfs_lock_root_node(root); - nritems = btrfs_header_nritems(root_node); - root->defrag_max.objectid = 0; - /* from above we know this is not a leaf */ - btrfs_node_key_to_cpu(root_node, &root->defrag_max, - nritems - 1); - btrfs_tree_unlock(root_node); - free_extent_buffer(root_node); - memset(&key, 0, sizeof(key)); - } else { - memcpy(&key, &root->defrag_progress, sizeof(key)); - } - - path->keep_locks = 1; - - ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); - if (ret < 0) - goto out; - if (ret > 0) { - ret = 0; - goto out; - } - btrfs_release_path(path); - /* - * We don't need a lock on a leaf. btrfs_realloc_node() will lock all - * leafs from path->nodes[1], so set lowest_level to 1 to avoid later - * a deadlock (attempting to write lock an already write locked leaf). - */ - path->lowest_level = 1; - wret = btrfs_search_slot(trans, root, &key, path, 0, 1); - - if (wret < 0) { - ret = wret; - goto out; - } - if (!path->nodes[1]) { - ret = 0; - goto out; - } - /* - * The node at level 1 must always be locked when our path has - * keep_locks set and lowest_level is 1, regardless of the value of - * path->slots[1]. - */ - BUG_ON(path->locks[1] == 0); - ret = btrfs_realloc_node(trans, root, - path->nodes[1], 0, - &last_ret, - &root->defrag_progress); - if (ret) { - WARN_ON(ret == -EAGAIN); - goto out; - } - /* - * Now that we reallocated the node we can find the next key. Note that - * btrfs_find_next_key() can release our path and do another search - * without COWing, this is because even with path->keep_locks = 1, - * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a - * node when path->slots[node_level - 1] does not point to the last - * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore - * we search for the next key after reallocating our node. 
- */ - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - BTRFS_OLDEST_GENERATION); - if (next_key_ret == 0) { - memcpy(&root->defrag_progress, &key, sizeof(key)); - ret = -EAGAIN; - } -out: - btrfs_free_path(path); - if (ret == -EAGAIN) { - if (root->defrag_max.objectid > root->defrag_progress.objectid) - goto done; - if (root->defrag_max.type > root->defrag_progress.type) - goto done; - if (root->defrag_max.offset > root->defrag_progress.offset) - goto done; - ret = 0; - } -done: - if (ret != -EAGAIN) - memset(&root->defrag_progress, 0, - sizeof(root->defrag_progress)); - - return ret; -} -- cgit From 6e3df18ba7e8e68015dd66bcab326a4b7aaed085 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:23 -0400 Subject: btrfs: move the auto defrag code to defrag.c This currently exists in file.c, move it to the more natural location in defrag.c. Signed-off-by: Josef Bacik [ reformat comments ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 337 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 340 ------------------------------------------------------ 2 files changed, 337 insertions(+), 340 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5df604846de6..4c8c03185ce7 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -11,6 +11,326 @@ #include "locking.h" #include "accessors.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; + +/* + * When auto defrag is enabled we queue up these defrag structs to remember + * which inodes need defragging passes. + */ +struct inode_defrag { + struct rb_node rb_node; + /* Inode number */ + u64 ino; + /* + * Transid where the defrag was added, we search for extents newer than + * this. + */ + u64 transid; + + /* Root objectid */ + u64 root; + + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, thus + * needs to be passed from higher layer. + * (aka, inode_should_defrag()) + */ + u32 extent_thresh; +}; + +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + +/* + * Pop a record for an inode into the defrag tree. The lock must be held + * already. + * + * If you're inserting a record for an older transid than an existing record, + * the transid already in the tree is lowered. + * + * If an existing record is found the defrag item you pass in is freed. + */ +static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct inode_defrag *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + int ret; + + p = &fs_info->defrag_inodes.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) + p = &parent->rb_left; + else if (ret > 0) + p = &parent->rb_right; + else { + /* + * If we're reinserting an entry for an old defrag run, + * make sure to lower the transid of our existing + * record. 
+ */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); + return -EEXIST; + } + } + set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); + rb_link_node(&defrag->rb_node, parent, p); + rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); + return 0; +} + +static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(fs_info)) + return 0; + + return 1; +} + +/* + * Insert a defrag record for this inode if auto defrag is enabled. + */ +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode_defrag *defrag; + u64 transid; + int ret; + + if (!__need_auto_defrag(fs_info)) + return 0; + + if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) + return 0; + + if (trans) + transid = trans->transid; + else + transid = inode->root->last_trans; + + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); + if (!defrag) + return -ENOMEM; + + defrag->ino = btrfs_ino(inode); + defrag->transid = transid; + defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; + + spin_lock(&fs_info->defrag_inodes_lock); + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { + /* + * If we set the IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, the new in-memory inode doesn't + * have the IN_DEFRAG flag set. In that case, we may find an + * existing defrag record. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + spin_unlock(&fs_info->defrag_inodes_lock); + return 0; +} + +/* + * Pick the defraggable inode that we want; if it doesn't exist, we will get the + * next one.
+ */ +static struct inode_defrag *btrfs_pick_defrag_inode( + struct btrfs_fs_info *fs_info, u64 root, u64 ino) +{ + struct inode_defrag *entry = NULL; + struct inode_defrag tmp; + struct rb_node *p; + struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; + + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) + p = parent->rb_left; + else if (ret > 0) + p = parent->rb_right; + else + goto out; + } + + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) + entry = rb_entry(parent, struct inode_defrag, rb_node); + else + entry = NULL; + } +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; +} + +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + cond_resched_lock(&fs_info->defrag_inodes_lock); + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ + struct btrfs_root *inode_root; + struct inode *inode; + struct btrfs_ioctl_defrag_range_args range; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; + + /* Get the inode */ + inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); + if (IS_ERR(inode_root)) { + ret = PTR_ERR(inode_root); + goto cleanup; + } + + inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); + btrfs_put_root(inode_root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto cleanup; + } + + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + + /* Do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; + + sb_start_write(fs_info->sb); + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); + iput(inode); + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + +cleanup: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; +} + +/* + * Run through the list of inodes in the FS that need defragging. + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; + + atomic_inc(&fs_info->defrag_running); + while (1) { + /* Pause the auto defragger. 
*/ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + break; + + if (!__need_auto_defrag(fs_info)) + break; + + /* find an inode to defrag */ + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); + if (!defrag) { + if (root_objectid || first_ino) { + root_objectid = 0; + first_ino = 0; + continue; + } else { + break; + } + } + + first_ino = defrag->ino + 1; + root_objectid = defrag->root; + + __btrfs_run_defrag_inode(fs_info, defrag); + } + atomic_dec(&fs_info->defrag_running); + + /* + * During unmount, we use the transaction_wait queue to wait for the + * defragger to stop. + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + /* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to @@ -131,3 +451,20 @@ done: return ret; } + +void __cold btrfs_auto_defrag_exit(void) +{ + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int __init btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bad676d2e23f..8500472aa6ef 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -34,329 +34,6 @@ #include "accessors.h" #include "extent-tree.h" -static struct kmem_cache *btrfs_inode_defrag_cachep; -/* - * when auto defrag is enabled we - * queue up these defrag structs to remember which - * inodes need defragging passes - */ -struct inode_defrag { - struct rb_node rb_node; - /* objectid */ - u64 ino; - /* - * transid where the defrag was added, we search for - * extents newer than this - */ - u64 transid; - - /* root objectid */ - u64 root; - - /* - * The extent size threshold for autodefrag. - * - * This value is different for compressed/non-compressed extents, - * thus needs to be passed from higher layer. - * (aka, inode_should_defrag()) - */ - u32 extent_thresh; -}; - -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) -{ - if (defrag1->root > defrag2->root) - return 1; - else if (defrag1->root < defrag2->root) - return -1; - else if (defrag1->ino > defrag2->ino) - return 1; - else if (defrag1->ino < defrag2->ino) - return -1; - else - return 0; -} - -/* pop a record for an inode into the defrag tree. 
The lock - * must be held already - * - * If you're inserting a record for an older transid than an - * existing record, the transid already in the tree is lowered - * - * If an existing record is found the defrag item you - * pass in is freed - */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; - - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* if we're reinserting an entry for - * an old defrag run, make sure to - * lower the transid of our existing record - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } - } - set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); - return 0; -} - -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(fs_info)) - return 0; - - return 1; -} - -/* - * insert a defrag record for this inode if auto defrag is - * enabled - */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode_defrag *defrag; - u64 transid; - int ret; - - if (!__need_auto_defrag(fs_info)) - return 0; - - if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = inode->root->last_trans; - - defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); - if (!defrag) - return -ENOMEM; - - defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; - defrag->extent_thresh = extent_thresh; - - spin_lock(&fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { - /* - * If we set IN_DEFRAG flag and evict the inode from memory, - * and then re-read this inode, this new inode doesn't have - * IN_DEFRAG flag. At the case, we may find the existed defrag. - */ - ret = __btrfs_add_inode_defrag(inode, defrag); - if (ret) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - spin_unlock(&fs_info->defrag_inodes_lock); - return 0; -} - -/* - * pick the defragable inode that we want, if it doesn't exist, we will get - * the next one. 
- */ -static struct inode_defrag * -btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) -{ - struct inode_defrag *entry = NULL; - struct inode_defrag tmp; - struct rb_node *p; - struct rb_node *parent = NULL; - int ret; - - tmp.ino = ino; - tmp.root = root; - - spin_lock(&fs_info->defrag_inodes_lock); - p = fs_info->defrag_inodes.rb_node; - while (p) { - parent = p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(&tmp, entry); - if (ret < 0) - p = parent->rb_left; - else if (ret > 0) - p = parent->rb_right; - else - goto out; - } - - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; - } -out: - if (entry) - rb_erase(parent, &fs_info->defrag_inodes); - spin_unlock(&fs_info->defrag_inodes_lock); - return entry; -} - -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - struct rb_node *node; - - spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - - cond_resched_lock(&fs_info->defrag_inodes_lock); - - node = rb_first(&fs_info->defrag_inodes); - } - spin_unlock(&fs_info->defrag_inodes_lock); -} - -#define BTRFS_DEFRAG_BATCH 1024 - -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) -{ - struct btrfs_root *inode_root; - struct inode *inode; - struct btrfs_ioctl_defrag_range_args range; - int ret = 0; - u64 cur = 0; - -again: - if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) - goto cleanup; - if (!__need_auto_defrag(fs_info)) - goto cleanup; - - /* get the inode */ - inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); - if (IS_ERR(inode_root)) { - ret = PTR_ERR(inode_root); - goto cleanup; - } - - inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); - btrfs_put_root(inode_root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto cleanup; - } - - if (cur >= i_size_read(inode)) { - iput(inode); - goto cleanup; - } - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - memset(&range, 0, sizeof(range)); - range.len = (u64)-1; - range.start = cur; - range.extent_thresh = defrag->extent_thresh; - - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); - iput(inode); - - if (ret < 0) - goto cleanup; - - cur = max(cur + fs_info->sectorsize, range.start); - goto again; - -cleanup: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return ret; -} - -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - u64 first_ino = 0; - u64 root_objectid = 0; - - atomic_inc(&fs_info->defrag_running); - while (1) { - /* Pause the auto defragger. 
*/ - if (test_bit(BTRFS_FS_STATE_REMOUNTING, - &fs_info->fs_state)) - break; - - if (!__need_auto_defrag(fs_info)) - break; - - /* find an inode to defrag */ - defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, - first_ino); - if (!defrag) { - if (root_objectid || first_ino) { - root_objectid = 0; - first_ino = 0; - continue; - } else { - break; - } - } - - first_ino = defrag->ino + 1; - root_objectid = defrag->root; - - __btrfs_run_defrag_inode(fs_info, defrag); - } - atomic_dec(&fs_info->defrag_running); - - /* - * during unmount, we use the transaction_wait queue to - * wait for the defragger to stop - */ - wake_up(&fs_info->transaction_wait); - return 0; -} - /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ @@ -4153,23 +3830,6 @@ const struct file_operations btrfs_file_operations = { .remap_file_range = btrfs_remap_file_range, }; -void __cold btrfs_auto_defrag_exit(void) -{ - kmem_cache_destroy(btrfs_inode_defrag_cachep); -} - -int __init btrfs_auto_defrag_init(void) -{ - btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_inode_defrag_cachep) - return -ENOMEM; - - return 0; -} - int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) { int ret; -- cgit From a6a01ca61f4949037fa7b94278ab260eab02a289 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:24 -0400 Subject: btrfs: move the file defrag code into defrag.c This is the other big portion of defrag code that has existed in ioctl.c. Move it to its new home in defrag.c. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 903 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 902 ----------------------------------------------------- 2 files changed, 903 insertions(+), 902 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 4c8c03185ce7..2c260eef40d4 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -10,6 +10,9 @@ #include "transaction.h" #include "locking.h" #include "accessors.h" +#include "messages.h" +#include "delalloc-space.h" +#include "subpage.h" static struct kmem_cache *btrfs_inode_defrag_cachep; @@ -452,6 +455,906 @@ done: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. 
+ */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. 
+ */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + u64 newer_than, bool locked) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; + + /* + * Hopefully we have this extent in the tree already, try without the + * full extent lock. + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, sectorsize); + read_unlock(&em_tree->lock); + + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + + if (!em) { + struct extent_state *cached = NULL; + u64 end = start + sectorsize - 1; + + /* Get the big lock and read metadata off disk. */ + if (!locked) + lock_extent(io_tree, start, end, &cached); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); + if (!locked) + unlock_extent(io_tree, start, end, &cached); + + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return fs_info->max_extent_size; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + u32 extent_thresh, u64 newer_than, bool locked) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *next; + bool ret = false; + + /* This is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + /* + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. + */ + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + /* No more em or hole */ + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(fs_info, em)) + goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + + ret = true; +out: + free_extent_map(next); + return ret; +} + +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. 
+ * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. + */ + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } + + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; + + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } + + /* + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. + */ + if (!PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + } + return page; +} + +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Collect all valid target extents. + * + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. 
+ * @locked: if the range has already held extent lock + * @target_list: list of targets file extents + */ +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + bool last_is_target = false; + u64 cur = start; + int ret = 0; + + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; + + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked); + if (!em) + break; + + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; + + /* Skip older extent */ + if (em->generation < newer_than) + goto next; + + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. + */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (range_len >= extent_thresh) + goto next; + + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(fs_info, em)) + goto next; + + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. 
+ */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + extent_thresh, newer_than, locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ + } + +add: + last_is_target = true; + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; + } + /* Fall through to allocate a new entry */ + } + + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; + } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); + +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. 
+ * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + } + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); + + return ret; +} + +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; + + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + + /* Lock the pages range */ + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); + /* + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. 
+ */
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list, last_scanned_ret);
+ if (ret < 0)
+ goto unlock_extent;
+
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+ kfree(pages);
+ return ret;
+}
+
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
+
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list, NULL);
+ if (ret < 0)
+ goto out;
+
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
+
+ /* Reached or beyond the limit */
+ if (max_sectors && *sectors_defragged >= max_sectors) {
+ ret = 1;
+ break;
+ }
+
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
+
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len >>
+ inode->root->fs_info->sectorsize_bits;
+ }
+out:
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
+ return ret;
+}
+
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NULL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where the next defrag should start.
+ * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early without
+ * defragging all of the range).
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ unsigned long sectors_defragged = 0;
+ u64 isize = i_size_read(inode);
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool ra_allocated = false;
+ int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
+ u32 extent_thresh = range->extent_thresh;
+ pgoff_t start_index;
+
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
+
+ if (do_compress) {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+
+ if (extent_thresh == 0)
+ extent_thresh = SZ_256K;
+
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
+ /*
+ * If we were not given a ra, allocate a readahead context. As
+ * readahead is just an optimization, defrag will work without it so
+ * we don't error out.
+ */
+ if (!ra) {
+ ra_allocated = true;
+ ra = kzalloc(sizeof(*ra), GFP_KERNEL);
+ if (ra)
+ file_ra_state_init(ra, inode->i_mapping);
+ }
+
+ /*
+ * Make writeback start from the beginning of the range, so that the
+ * defrag range can be written sequentially.
+ */
+ start_index = cur >> PAGE_SHIFT;
+ if (start_index < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = start_index;
+
+ while (cur < last_byte) {
+ const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
+ u64 cluster_end;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ /* We want the cluster end at a page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
+
+ btrfs_inode_lock(inode, 0);
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ btrfs_inode_unlock(inode, 0);
+ break;
+ }
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
+ break;
+ }
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
+
+ if (sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+
+ btrfs_inode_unlock(inode, 0);
+ if (ret < 0)
+ break;
+ cur = max(cluster_end + 1, last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ cond_resched();
+ }
+
+ if (ra_allocated)
+ kfree(ra);
+ /*
+ * Update range.start for autodefrag, this will indicate where to
+ * start in the next run.
+ */
+ range->start = cur;
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for the compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
+ }
+ if (do_compress) {
+ btrfs_inode_lock(inode, 0);
+ BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
+ }
+ return ret;
+}
+
 void __cold btrfs_auto_defrag_exit(void)
 {
 kmem_cache_destroy(btrfs_inode_defrag_cachep);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 96858240588d..42453dd2ed31 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1039,908 +1039,6 @@ out:
 return ret;
 }
 
-/*
- * Defrag specific helper to get an extent map.
- *
- * Differences between this and btrfs_get_extent() are:
- *
- * - No extent_map will be added to inode->extent_tree
- * To reduce memory usage in the long run.
- *
- * - Extra optimization to skip file extents older than @newer_than
- * By using btrfs_search_forward() we can skip entire file ranges that
- * have extents created in past transactions, because btrfs_search_forward()
- * will not visit leaves and nodes with a generation smaller than the given
- * minimal generation threshold (@newer_than).
- *
- * Return a valid em if we find a file extent matching the requirement.
- * Return NULL if we cannot find a file extent matching the requirement.
- *
- * Return ERR_PTR() for error.
- */
-static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
- u64 start, u64 newer_than)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_file_extent_item *fi;
- struct btrfs_path path = { 0 };
- struct extent_map *em;
- struct btrfs_key key;
- u64 ino = btrfs_ino(inode);
- int ret;
-
- em = alloc_extent_map();
- if (!em) {
- ret = -ENOMEM;
- goto err;
- }
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
-
- if (newer_than) {
- ret = btrfs_search_forward(root, &key, &path, newer_than);
- if (ret < 0)
- goto err;
- /* Can't find anything newer */
- if (ret > 0)
- goto not_found;
- } else {
- ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
- if (ret < 0)
- goto err;
- }
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
- /*
- * If btrfs_search_slot() makes the path point beyond nritems,
- * we should not have an empty leaf, as this inode must at
- * least have its INODE_ITEM.
- */
- ASSERT(btrfs_header_nritems(path.nodes[0]));
- path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
- }
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- /* Perfect match, no need to go one slot back */
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
- key.offset == start)
- goto iterate;
-
- /* We didn't find a perfect match, need to go one slot back */
- if (path.slots[0] > 0) {
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path.slots[0]--;
- }
-
-iterate:
- /* Iterate through the path to find a file extent covering @start */
- while (true) {
- u64 extent_end;
-
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
- goto next;
-
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
-
- /*
- * We may go one slot back to INODE_REF/XATTR item, then
- * need to go forward until we reach an EXTENT_DATA.
- * But we should still have the correct ino as key.objectid.
- */
- if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
- goto next;
-
- /* It's beyond our target range, definitely no extent found */
- if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
- goto not_found;
-
- /*
- * | |<- File extent ->|
- * \- start
- *
- * This means there is a hole between start and key.offset.
- */
- if (key.offset > start) {
- em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
- em->len = key.offset - start;
- break;
- }
-
- fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
- struct btrfs_file_extent_item);
- extent_end = btrfs_file_extent_end(&path);
-
- /*
- * |<- file extent ->| |
- * \- start
- *
- * We haven't reached start, search next slot.
- */
- if (extent_end <= start)
- goto next;
-
- /* Now this extent covers @start, convert it to em */
- btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
- break;
-next:
- ret = btrfs_next_item(root, &path);
- if (ret < 0)
- goto err;
- if (ret > 0)
- goto not_found;
- }
- btrfs_release_path(&path);
- return em;
-
-not_found:
- btrfs_release_path(&path);
- free_extent_map(em);
- return NULL;
-
-err:
- btrfs_release_path(&path);
- free_extent_map(em);
- return ERR_PTR(ret);
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- u64 newer_than, bool locked)
-{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
- const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
-
- /*
- * hopefully we have this extent in the tree already, try without
- * the full extent lock
- */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
- read_unlock(&em_tree->lock);
-
- /*
- * We can get a merged extent, in that case, we need to re-search
- * the tree to get the original em for defrag.
- *
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
- */
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
- newer_than && em->generation >= newer_than) {
- free_extent_map(em);
- em = NULL;
- }
-
- if (!em) {
- struct extent_state *cached = NULL;
- u64 end = start + sectorsize - 1;
-
- /* get the big lock and read metadata off disk */
- if (!locked)
- lock_extent(io_tree, start, end, &cached);
- em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
- if (!locked)
- unlock_extent(io_tree, start, end, &cached);
-
- if (IS_ERR(em))
- return NULL;
- }
-
- return em;
-}
-
-static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
- const struct extent_map *em)
-{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- return BTRFS_MAX_COMPRESSED;
- return fs_info->max_extent_size;
-}
-
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
- u32 extent_thresh, u64 newer_than, bool locked)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *next;
- bool ret = false;
-
- /* this is the last extent */
- if (em->start + em->len >= i_size_read(inode))
- return false;
-
- /*
- * Here we need to pass @newer_than when checking the next extent, or
- * we will hit a case where we mark the current extent for defrag, but
- * the next one will not be a target.
- * This will just cause extra IO without really reducing the fragments.
- */
- next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
- /* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
- goto out;
- /*
- * If the next extent is at its max capacity, defragging current extent
- * makes no sense, as the total number of extents won't change.
- */
- if (next->len >= get_extent_max_capacity(fs_info, em))
- goto out;
- /* Skip older extent */
- if (next->generation < newer_than)
- goto out;
- /* Also check extent size */
- if (next->len >= extent_thresh)
- goto out;
-
- ret = true;
-out:
- free_extent_map(next);
- return ret;
-}
-
-/*
- * Prepare one page to be defragged.
- *
- * This will ensure:
- *
- * - Returned page is locked and has been set up properly.
- * - No ordered extent exists in the page.
- * - The page is uptodate.
- *
- * NOTE: Caller should also wait for page writeback after the cluster is
- * prepared, here we don't do writeback wait for each page.
- */
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
- pgoff_t index)
-{
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
- struct extent_state *cached_state = NULL;
- struct page *page;
- int ret;
-
-again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
- * Filesystem transparent huge pages are typically only used for
- * executables that explicitly enable them, so this isn't very
- * restrictive.
- */
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-ETXTBSY);
- }
-
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(ret);
- }
-
- /* Wait for any existing ordered extent in the range */
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * We unlocked the page above, so we need to check if it was
- * released or not.
- */
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- }
-
- /*
- * Now the page range has no ordered extent any more. Read the page to
- * make it uptodate.
- */
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
-struct defrag_target_range {
- struct list_head list;
- u64 start;
- u64 len;
-};
-
-/*
- * Collect all valid target extents.
- *
- * @start: file offset to lookup
- * @len: length to lookup
- * @extent_thresh: file extent size threshold, any extent size >= this value
- * will be ignored
- * @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
- * if true, @extent_thresh will be ignored and all regular
- * file extents meeting @newer_than will be targets.
- * @locked: if the extent lock is already held on the range
- * @target_list: list of target file extents
- */
-static int defrag_collect_targets(struct btrfs_inode *inode,
- u64 start, u64 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list,
- u64 *last_scanned_ret)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool last_is_target = false;
- u64 cur = start;
- int ret = 0;
-
- while (cur < start + len) {
- struct extent_map *em;
- struct defrag_target_range *new;
- bool next_mergeable = true;
- u64 range_len;
-
- last_is_target = false;
- em = defrag_lookup_extent(&inode->vfs_inode, cur,
- newer_than, locked);
- if (!em)
- break;
-
- /*
- * If the file extent is an inlined one, we may still want to
- * defrag it (fallthrough) if it will cause a regular extent.
- * This is for users who want to convert inline extents to
- * regular ones through the max_inline= mount option.
- */
- if (em->block_start == EXTENT_MAP_INLINE &&
- em->len <= inode->root->fs_info->max_inline)
- goto next;
-
- /* Skip hole/delalloc/preallocated extents */
- if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- goto next;
-
- /* Skip older extent */
- if (em->generation < newer_than)
- goto next;
-
- /* This em is under writeback, no need to defrag */
- if (em->generation == (u64)-1)
- goto next;
-
- /*
- * Our start offset might be in the middle of an existing extent
- * map, so take that into account.
- */
- range_len = em->len - (cur - em->start);
- /*
- * If this range of the extent map is already flagged for delalloc,
- * skip it, because:
- *
- * 1) We could deadlock later, when trying to reserve space for
- * delalloc, because in case we can't immediately reserve space
- * the flusher can start delalloc and wait for the respective
- * ordered extents to complete. The deadlock would happen
- * because we do the space reservation while holding the range
- * locked, and starting writeback, or finishing an ordered
- * extent, requires locking the range;
- *
- * 2) If there's delalloc there, it means there are dirty pages for
- * which writeback has not started yet (we clean the delalloc
- * flag when starting writeback and after creating an ordered
- * extent). If we mark pages in an adjacent range for defrag,
- * then we will have a larger contiguous range for delalloc,
- * very likely resulting in a larger extent after writeback is
- * triggered (except in a case of free space fragmentation).
- */
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
- goto next;
-
- /*
- * For the do_compress case, we want to compress all valid file
- * extents, thus no @extent_thresh or mergeable check.
- */
- if (do_compress)
- goto add;
-
- /* Skip too large extent */
- if (range_len >= extent_thresh)
- goto next;
-
- /*
- * Skip extents already at their max capacity, this is mostly for
- * compressed extents, whose max cap is only 128K.
- */
- if (em->len >= get_extent_max_capacity(fs_info, em))
- goto next;
-
- /*
- * Normally there are no more extents after an inline one, thus
- * @next_mergeable will normally be false and the extent will not
- * be defragged.
- * So if an inline extent passed all the above checks, just add it
- * for defrag, so it can be converted to a regular extent.
- */
- if (em->block_start == EXTENT_MAP_INLINE)
- goto add;
-
- next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
- extent_thresh, newer_than, locked);
- if (!next_mergeable) {
- struct defrag_target_range *last;
-
- /* Empty target list, no way to merge with last entry */
- if (list_empty(target_list))
- goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- /* Not mergeable with last entry */
- if (last->start + last->len != cur)
- goto next;
-
- /* Mergeable, fall through to add it to @target_list. */
- }
-
-add:
- last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
- /*
- * This one is a good target, check if it can be merged into
- * the last range of the target list.
- */
- if (!list_empty(target_list)) {
- struct defrag_target_range *last;
-
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- ASSERT(last->start + last->len <= cur);
- if (last->start + last->len == cur) {
- /* Mergeable, enlarge the last entry */
- last->len += range_len;
- goto next;
- }
- /* Fall through to allocate a new entry */
- }
-
- /* Allocate new defrag_target_range */
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new) {
- free_extent_map(em);
- ret = -ENOMEM;
- break;
- }
- new->start = cur;
- new->len = range_len;
- list_add_tail(&new->list, target_list);
-
-next:
- cur = extent_map_end(em);
- free_extent_map(em);
- }
- if (ret < 0) {
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
-
- list_for_each_entry_safe(entry, tmp, target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- }
- if (!ret && last_scanned_ret) {
- /*
- * If the last extent is not a target, the caller can skip to
- * the end of that extent.
- * Otherwise, we can only go to the end of the specified range.
- */
- if (!last_is_target)
- *last_scanned_ret = max(cur, *last_scanned_ret);
- else
- *last_scanned_ret = max(start + len, *last_scanned_ret);
- }
- return ret;
-}
-
-#define CLUSTER_SIZE (SZ_256K)
-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
-
-/*
- * Defrag one contiguous target range.
- *
- * @inode: target inode
- * @target: target range to defrag
- * @pages: locked pages covering the defrag range
- * @nr_pages: number of locked pages
- *
- * Caller should ensure:
- *
- * - Pages are prepared
- * Pages should be locked, no ordered extent in the pages range,
- * no writeback.
- * - * - Extent bits are locked - */ -static int defrag_one_locked_target(struct btrfs_inode *inode, - struct defrag_target_range *target, - struct page **pages, int nr_pages, - struct extent_state **cached_state) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_changeset *data_reserved = NULL; - const u64 start = target->start; - const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); - int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); - - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); - if (ret < 0) - return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); - } - btrfs_delalloc_release_extents(inode, len); - extent_changeset_free(data_reserved); - - return ret; -} - -static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress, - u64 *last_scanned_ret) -{ - struct extent_state *cached_state = NULL; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - struct page **pages; - const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; - int ret = 0; - int i; - - ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); - ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) - return -ENOMEM; - - /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; - } - } - for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); - - /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); - /* - * Now we have a consistent view about the extent map, re-check - * which range really needs to be defragged. - * - * And this time we have extent locked already, pass @locked = true - * so that we won't relock the extent range and cause deadlock. 
- */
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, true,
- &target_list, last_scanned_ret);
- if (ret < 0)
- goto unlock_extent;
-
- list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
- &cached_state);
- if (ret < 0)
- break;
- }
-
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
-unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
-free_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- }
- kfree(pages);
- return ret;
-}
-
-static int defrag_one_cluster(struct btrfs_inode *inode,
- struct file_ra_state *ra,
- u64 start, u32 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- unsigned long *sectors_defragged,
- unsigned long max_sectors,
- u64 *last_scanned_ret)
-{
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- int ret;
-
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, false,
- &target_list, NULL);
- if (ret < 0)
- goto out;
-
- list_for_each_entry(entry, &target_list, list) {
- u32 range_len = entry->len;
-
- /* Reached or beyond the limit */
- if (max_sectors && *sectors_defragged >= max_sectors) {
- ret = 1;
- break;
- }
-
- if (max_sectors)
- range_len = min_t(u32, range_len,
- (max_sectors - *sectors_defragged) * sectorsize);
-
- /*
- * If defrag_one_range() has updated last_scanned_ret,
- * our range may already be invalid (e.g. hole punched).
- * Skip if our range is before last_scanned_ret, as there is
- * no need to defrag the range anymore.
- */
- if (entry->start + range_len <= *last_scanned_ret)
- continue;
-
- if (ra)
- page_cache_sync_readahead(inode->vfs_inode.i_mapping,
- ra, NULL, entry->start >> PAGE_SHIFT,
- ((entry->start + range_len - 1) >> PAGE_SHIFT) -
- (entry->start >> PAGE_SHIFT) + 1);
- /*
- * Here we may not defrag any range if holes are punched before
- * we locked the pages.
- * But that's fine, it only affects the @sectors_defragged
- * accounting.
- */
- ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress,
- last_scanned_ret);
- if (ret < 0)
- break;
- *sectors_defragged += range_len >>
- inode->root->fs_info->sectorsize_bits;
- }
-out:
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- if (ret >= 0)
- *last_scanned_ret = max(*last_scanned_ret, start + len);
- return ret;
-}
-
-/*
- * Entry point to file defragmentation.
- *
- * @inode: inode to be defragged
- * @ra: readahead state (can be NULL)
- * @range: defrag options including range and flags
- * @newer_than: minimum transid to defrag
- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
- * will be defragged.
- *
- * Return <0 for error.
- * Return >=0 for the number of sectors defragged, and range->start will be updated
- * to indicate the file offset where the next defrag should start.
- * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early without
- * defragging all of the range).
- */
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
- struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_to_defrag)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
- u64 cur;
- u64 last_byte;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
- bool ra_allocated = false;
- int compress_type = BTRFS_COMPRESS_ZLIB;
- int ret = 0;
- u32 extent_thresh = range->extent_thresh;
- pgoff_t start_index;
-
- if (isize == 0)
- return 0;
-
- if (range->start >= isize)
- return -EINVAL;
-
- if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
- }
-
- if (extent_thresh == 0)
- extent_thresh = SZ_256K;
-
- if (range->start + range->len > range->start) {
- /* Got a specific range */
- last_byte = min(isize, range->start + range->len);
- } else {
- /* Defrag until file end */
- last_byte = isize;
- }
-
- /* Align the range */
- cur = round_down(range->start, fs_info->sectorsize);
- last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
-
- /*
- * If we were not given a ra, allocate a readahead context. As
- * readahead is just an optimization, defrag will work without it so
- * we don't error out.
- */
- if (!ra) {
- ra_allocated = true;
- ra = kzalloc(sizeof(*ra), GFP_KERNEL);
- if (ra)
- file_ra_state_init(ra, inode->i_mapping);
- }
-
- /*
- * Make writeback start from the beginning of the range, so that the
- * defrag range can be written sequentially.
- */
- start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
-
- while (cur < last_byte) {
- const unsigned long prev_sectors_defragged = sectors_defragged;
- u64 last_scanned = cur;
- u64 cluster_end;
-
- if (btrfs_defrag_cancelled(fs_info)) {
- ret = -EAGAIN;
- break;
- }
-
- /* We want the cluster end at a page boundary when possible */
- cluster_end = (((cur >> PAGE_SHIFT) +
- (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
- cluster_end = min(cluster_end, last_byte);
-
- btrfs_inode_lock(inode, 0);
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
- cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
- max_to_defrag, &last_scanned);
-
- if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
- btrfs_inode_unlock(inode, 0);
- if (ret < 0)
- break;
- cur = max(cluster_end + 1, last_scanned);
- if (ret > 0) {
- ret = 0;
- break;
- }
- cond_resched();
- }
-
- if (ra_allocated)
- kfree(ra);
- /*
- * Update range.start for autodefrag, this will indicate where to
- * start in the next run.
- */
- range->start = cur;
- if (sectors_defragged) {
- /*
- * We have defragged some sectors, for the compression case they
- * need to be written back immediately.
- */ - if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - else if (range->compress_type == BTRFS_COMPRESS_ZSTD) - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - ret = sectors_defragged; - } - if (do_compress) { - btrfs_inode_lock(inode, 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); - } - return ret; -} - /* * Try to start exclusive operation @type or cancel it if it's running. * -- cgit From 59b818e064ab9051cd344b82420307e772d6bca7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:25 -0400 Subject: btrfs: move defrag related prototypes to their own header Now that the defrag code is all in one file, create a defrag.h and move all the defrag related prototypes and helper out of ctree.h and into defrag.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 18 ------------------ fs/btrfs/defrag.c | 1 + fs/btrfs/defrag.h | 22 ++++++++++++++++++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + 8 files changed, 28 insertions(+), 18 deletions(-) create mode 100644 fs/btrfs/defrag.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9267b117860..689220637d00 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -773,19 +773,10 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); /* file.c */ -int __init btrfs_auto_defrag_init(void); -void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, @@ -811,10 +802,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -/* tree-defrag.c */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); @@ -945,11 +932,6 @@ static inline int is_fstree(u64 rootid) return 0; } -static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) -{ - return signal_pending(current); -} - /* verity.c */ #ifdef CONFIG_FS_VERITY diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 2c260eef40d4..291263f93e47 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -13,6 +13,7 @@ #include "messages.h" #include "delalloc-space.h" #include 
"subpage.h" +#include "defrag.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h new file mode 100644 index 000000000000..5305f2283b5e --- /dev/null +++ b/fs/btrfs/defrag.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DEFRAG_H +#define BTRFS_DEFRAG_H + +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag); +int __init btrfs_auto_defrag_init(void); +void __cold btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ebc9baab0e47..2f7dd79a5a35 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -47,6 +47,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b9857e30a08..97f28e37ec7c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -59,6 +59,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 42453dd2ed31..f91dbd1dc6d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -54,6 +54,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4030dfeb2f2..f105d360d6c9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,6 +52,7 @@ #include "raid56.h" #include "fs.h" #include "accessors.h" +#include "defrag.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 82b2e2ec90cf..99d3f2a66227 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" static struct kmem_cache *btrfs_trans_handle_cachep; -- cgit From f2b39277b87dbb26f15e2e3b667df25dad3dd2ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:26 -0400 Subject: btrfs: move dir-item prototypes into dir-item.h Move these prototypes out of ctree.h and into their own header file. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 38 -------------------------------------- fs/btrfs/dir-item.c | 1 + fs/btrfs/dir-item.h | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/xattr.c | 1 + 10 files changed, 50 insertions(+), 38 deletions(-) create mode 100644 fs/btrfs/dir-item.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 689220637d00..a9666dc95105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -689,44 +689,6 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); -/* dir-item.c */ -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct fscrypt_str *name); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct fscrypt_str *name, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index); -struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const struct fscrypt_str *name, int mod); -struct btrfs_dir_item * -btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - u64 index, const struct fscrypt_str *name, int mod); -struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const struct fscrypt_str *name); -int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_dir_item *di); -int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - const char *name, u16 name_len, - const void *data, u16 data_len); -struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, u16 name_len, - int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - const char *name, - int name_len); - /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ca69fb35a2cc..082eb0e19598 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "transaction.h" #include "accessors.h" +#include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h new file mode 100644 index 000000000000..aab4b7cc7fa0 --- /dev/null +++ b/fs/btrfs/dir-item.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIR_ITEM_H +#define BTRFS_DIR_ITEM_H + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const struct fscrypt_str *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const struct fscrypt_str *name, int mod); 
+struct btrfs_dir_item *btrfs_lookup_dir_index_item( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 index, const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const struct fscrypt_str *name); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + const char *name, + int name_len); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97f28e37ec7c..a3130e7c803d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f91dbd1dc6d1..41599b068026 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -55,6 +55,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2ece3a030a66..0d3f25d4f147 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -28,6 +28,7 @@ #include "xattr.h" #include "print-tree.h" #include "accessors.h" +#include "dir-item.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f105d360d6c9..9756b0cda626 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -53,6 +53,7 @@ #include "fs.h" #include "accessors.h" #include "defrag.h" +#include "dir-item.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 99d3f2a66227..a0b7702d980f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -29,6 +29,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e358df3668e7..c168db7519e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -25,6 +25,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "dir-item.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fcf2d5f7e198..0ed4b119a7ca 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,6 +22,7 @@ #include "props.h" #include "locking.h" #include "accessors.h" +#include "dir-item.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -- cgit From 7c8ede16280586c72c36af6604985d714b84a32c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:27 -0400 Subject: btrfs: move file-item prototypes into their own header Move these prototypes out of ctree.h and into file-item.h. 
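
For illustration, a hypothetical consumer of the new header: example_file_extent_range() is invented, and only the btrfs_file_extent_end() prototype comes from the moved declarations.

#include "ctree.h"
#include "file-item.h"

/*
 * Hypothetical helper: with @path positioned at a BTRFS_EXTENT_DATA_KEY
 * item, return the file range [@start, @end) that the item covers.
 */
static void example_file_extent_range(const struct btrfs_path *path,
				      const struct btrfs_key *key,
				      u64 *start, u64 *end)
{
	ASSERT(key->type == BTRFS_EXTENT_DATA_KEY);
	*start = key->offset;
	/* btrfs_file_extent_end() also handles inline extents */
	*end = btrfs_file_extent_end(path);
}
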
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/ctree.h | 31 ------------------------------- fs/btrfs/defrag.c | 1 + fs/btrfs/delayed-inode.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file-item.h | 35 +++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/tree-log.c | 1 + 17 files changed, 50 insertions(+), 31 deletions(-) create mode 100644 fs/btrfs/file-item.h (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c0615af0434f..61828f8375e6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -34,6 +34,7 @@ #include "extent_map.h" #include "subpage.h" #include "zoned.h" +#include "file-item.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9666dc95105..6bfea55c82a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -695,37 +695,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* file-item.c */ -int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 pos, - u64 num_bytes); -int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - u64 bytenr, int mod); -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); -void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, - const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, - const bool new_inline, - struct extent_map *em); -int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); -u64 btrfs_file_extent_end(const struct btrfs_path *path); - /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 291263f93e47..ed82085acea3 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -14,6 +14,7 @@ #include "delalloc-space.h" #include "subpage.h" #include "defrag.h" +#include "file-item.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f93d2695e423..c024f97de9e0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -18,6 +18,7 @@ #include "inode-item.h" #include "space-info.h" #include "accessors.h" +#include "file-item.h" #define 
BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 510be00da406..940d4fe23cfb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -40,6 +40,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "file-item.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 545d9e1f0f83..ea31a326ae93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -32,6 +32,7 @@ #include "compression.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 403a857d230e..20d88cd0b602 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -19,6 +19,7 @@ #include "compression.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h new file mode 100644 index 000000000000..51cd3ab5948c --- /dev/null +++ b/fs/btrfs/file-item.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_ITEM_H +#define BTRFS_FILE_ITEM_H + +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + u64 offset, bool one_ordered); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8500472aa6ef..9c100198dd26 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -33,6 +33,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "file-item.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 599b41523479..bc1b9aa164ec 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,6 +27,7 @@ #include "subpage.h" #include "inode-item.h" #include "accessors.h" +#include "file-item.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 724507ce7a7d..b65c45b5d681 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -13,6 +13,7 @@ #include "space-info.h" #include "accessors.h" #include "extent-tree.h" +#include "file-item.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3130e7c803d..7231f6b69096 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -61,6 +61,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "file-item.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9ba7914c7169..204bb884ab91 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "subpage.h" #include "accessors.h" +#include "file-item.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e345cc71da15..e86364bdac8e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -32,6 +32,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "file-item.h" /* * Relocation overview diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3aed760e1a80..e46e6c4d4bf9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -23,6 +23,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0d3f25d4f147..4cc9e855a769 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -29,6 +29,7 @@ #include "print-tree.h" #include "accessors.h" #include "dir-item.h" +#include "file-item.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c168db7519e7..7be540fb5c4a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -26,6 +26,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "dir-item.h" +#include "file-item.h" #define MAX_CONFLICT_INODES 10 -- cgit From c7a03b524d30382ed45883cd27906678e6a0dada Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:28 -0400 Subject: btrfs: move uuid tree prototypes to uuid-tree.h Move these out of ctree.h into uuid-tree.h to cut down on the code in ctree.h. 
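
A hypothetical sketch of a caller of the moved API: example_track_received_uuid() is invented; the btrfs_uuid_tree_add() prototype matches the new header below, and BTRFS_UUID_KEY_RECEIVED_SUBVOL is the UAPI key type used for received subvolume UUIDs.

#include "ctree.h"
#include "uuid-tree.h"

/* Hypothetical caller: record a received subvolume UUID in the uuid tree. */
static int example_track_received_uuid(struct btrfs_trans_handle *trans,
				       u8 *uuid, u64 subvol_id)
{
	return btrfs_uuid_tree_add(trans, uuid,
				   BTRFS_UUID_KEY_RECEIVED_SUBVOL, subvol_id);
}
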
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/uuid-tree.h | 12 ++++++++++++ fs/btrfs/volumes.c | 1 + 8 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 fs/btrfs/uuid-tree.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6bfea55c82a0..910a23e7cd8f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -682,13 +682,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *node, struct extent_buffer *parent); -/* uuid-tree.c */ -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); - /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f7dd79a5a35..e8082e3c5bdc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,6 +48,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "uuid-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7231f6b69096..061fa0923772 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,6 +62,7 @@ #include "defrag.h" #include "dir-item.h" #include "file-item.h" +#include "uuid-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41599b068026..c2439aebebd4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -56,6 +56,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "uuid-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a0b7702d980f..c6fe2ab8716b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "uuid-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 70304b89f31f..7c7001f42b14 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -12,6 +12,7 @@ #include "print-tree.h" #include "fs.h" #include "accessors.h" +#include "uuid-tree.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h new file mode 100644 index 000000000000..5350c87fe2ca --- /dev/null +++ b/fs/btrfs/uuid-tree.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_UUID_TREE_H +#define BTRFS_UUID_TREE_H + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f61bb79d4a7e..55164b2dba2a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "uuid-tree.h" static struct bio_set btrfs_bioset; -- cgit From 7572dec8f5223186ed0fa7f6da47dba98651cbf4 
Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:29 -0400 Subject: btrfs: move ioctl prototypes into ioctl.h Move these out of ctree.h into ioctl.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 12 ------------ fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/ioctl.h | 17 +++++++++++++++++ fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + 10 files changed, 25 insertions(+), 12 deletions(-) create mode 100644 fs/btrfs/ioctl.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 910a23e7cd8f..87930337b301 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -688,18 +688,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* ioctl.c */ -long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa); -int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(u8 *uuid); -void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_balance_args *bargs); - /* file.c */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); extern const struct file_operations btrfs_file_operations; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c100198dd26..22063340a4c2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -34,6 +34,7 @@ #include "accessors.h" #include "extent-tree.h" #include "file-item.h" +#include "ioctl.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 061fa0923772..7111eec812dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -63,6 +63,7 @@ #include "dir-item.h" #include "file-item.h" #include "uuid-tree.h" +#include "ioctl.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c2439aebebd4..4c5c2f890adc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -57,6 +57,7 @@ #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" +#include "ioctl.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 000000000000..8a855d5ac2fa --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_IOCTL_H +#define BTRFS_IOCTL_H + +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); +int btrfs_ioctl_get_supported_features(void __user *arg); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +int __pure btrfs_is_empty_uuid(u8 *uuid); +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +#endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4cc9e855a769..e9b7deccc5fe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -30,6 +30,7 @@ #include "accessors.h" #include "dir-item.h" #include "file-item.h" +#include "ioctl.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9756b0cda626..2cd05246bcd3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,7 @@ #include "accessors.h" #include "defrag.h" #include "dir-item.h" +#include "ioctl.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c6fe2ab8716b..bbc3ad3e297c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@ #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" +#include "ioctl.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index d02fa354fc2b..00ba5143a17d 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -18,6 +18,7 @@ #include "locking.h" #include "fs.h" #include "accessors.h" +#include "ioctl.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 55164b2dba2a..823de2f29289 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -36,6 +36,7 @@ #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" static struct bio_set btrfs_bioset; -- cgit From af142b6f44d36c4d0e1e53acbedbc30a588c58de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:30 -0400 Subject: btrfs: move file prototypes to file.h Move these out of ctree.h into file.h to cut down on code in ctree.h. 
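A consumer-side sketch tying the last two moves together: the entry points declared in the new ioctl.h get wired into an operations table of the kind that file.h declares extern. The struct below is illustrative only; the real table is btrfs_file_operations in file.c:

static const struct file_operations example_fops = {
	/* Prototypes now come from ioctl.h rather than ctree.h. */
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
};

(btrfs_fileattr_get/btrfs_fileattr_set hook into struct inode_operations rather than file_operations, which is why they are not shown here.)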
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 26 -------------------------- fs/btrfs/extent_io.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/file.h | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/ordered-data.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/tree-log.c | 1 + 10 files changed, 40 insertions(+), 26 deletions(-) create mode 100644 fs/btrfs/file.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 87930337b301..3a46b5b688e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -688,32 +688,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* file.c */ -int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct btrfs_inode *inode, - struct btrfs_path *path, const u64 start, - const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out); -int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u64 start, u64 end); -ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); -int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait); -void btrfs_check_nocow_unlock(struct btrfs_inode *inode); -bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, - u64 *delalloc_start_ret, u64 *delalloc_end_ret); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ea31a326ae93..7fe637408e98 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -33,6 +33,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "file.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 22063340a4c2..b94dc4b2c486 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -35,6 +35,7 @@ #include "extent-tree.h" #include "file-item.h" #include "ioctl.h" +#include "file.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h new file mode 100644 index 000000000000..f3d794e33d46 --- /dev/null +++ b/fs/btrfs/file.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_H +#define BTRFS_FILE_H + +extern const struct file_operations btrfs_file_operations; + +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); +int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached, bool noreserve); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); + +#endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bc1b9aa164ec..aef075b63188 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -28,6 +28,7 @@ #include "inode-item.h" #include "accessors.h" #include "file-item.h" +#include "file.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7111eec812dc..3fe3301b88b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,6 +64,7 @@ #include "file-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "file.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4c5c2f890adc..dba1f07b4194 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -58,6 +58,7 @@ #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "file.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1c36407803ca..2aea2a17ed95 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -18,6 +18,7 @@ #include "delalloc-space.h" #include "qgroup.h" #include "subpage.h" +#include "file.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 204bb884ab91..3c962b5d8dbd 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -13,6 +13,7 @@ #include "subpage.h" #include "accessors.h" #include "file-item.h" +#include "file.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7be540fb5c4a..1c505713511c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "root-tree.h" #include "dir-item.h" #include "file-item.h" +#include "file.h" #define MAX_CONFLICT_INODES 10 -- cgit From b538a271ae9bd07e926b1e3bbfcd1aebf63e5860 Mon Sep 17 00:00:00 2001 From: Josef Bacik 
Date: Wed, 26 Oct 2022 15:08:31 -0400 Subject: btrfs: move the 32bit warn defines into messages.h The code for these functions are in messages.c, move the defines and prototypes to messages.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 13 ------------- fs/btrfs/messages.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3a46b5b688e3..ae2af0aeebc4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -695,19 +695,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); -#if BITS_PER_LONG == 32 -#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) -/* - * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical - * addresses of extents. - * - * For 4K page size it's about 10T, for 64K it's 160T. - */ -#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) -void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); -void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); -#endif - /* * Get the correct offset inside the page of extent buffer. * diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 3772358f8d30..5ac410e313e4 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -229,4 +229,17 @@ do { \ BUG(); \ } while (0) +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. + * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + #endif -- cgit From cc68414c6123fcc436c96d8920f48120b0ca2613 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:32 -0400 Subject: btrfs: move the snapshot drop related prototypes to extent-tree.h These belong in extent-tree.h, they were missed because they were not grouped with the other extent-tree.c prototypes. 
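As a side note on the 32-bit limit macros relocated above, the "about 10T, for 64K it's 160T" figures follow directly from the definitions; a worked check, assuming the usual PAGE_SHIFT values:

/*
 * BTRFS_32BIT_MAX_FILE_SIZE = ((u64)ULONG_MAX + 1) << PAGE_SHIFT
 *   4K pages (PAGE_SHIFT = 12): 2^32 * 2^12 = 2^44 bytes =  16 TiB
 *  64K pages (PAGE_SHIFT = 16): 2^32 * 2^16 = 2^48 bytes = 256 TiB
 *
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD = BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8
 *   4K pages:  16 TiB * 5/8 =  10 TiB   ("about 10T")
 *  64K pages: 256 TiB * 5/8 = 160 TiB   ("160T")
 */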
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent-tree.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ae2af0aeebc4..dcbfb1b9d269 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,12 +675,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index b3674b008d58..ae5425253603 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -68,5 +68,11 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); #endif -- cgit From 33cf97a7b6585efc57c1277b9e346afa7c542999 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:33 -0400 Subject: btrfs: move acl prototypes into acl.h Move these out of ctree.h into acl.h to cut down on code in ctree.h. 
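The notable part of this move is the disabled-config fallback in the new header (visible in the diff below): btrfs_get_acl and btrfs_set_acl become NULL defines when CONFIG_BTRFS_FS_POSIX_ACL is off, which works because the VFS skips ACL handling for NULL ->get_acl/->set_acl pointers. A sketch of the consumer side, with an illustrative struct name:

static const struct inode_operations example_dir_iops = {
	.get_acl = btrfs_get_acl,	/* NULL when ACLs are compiled out */
	.set_acl = btrfs_set_acl,
};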
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/acl.c | 2 +- fs/btrfs/acl.h | 27 +++++++++++++++++++++++++++ fs/btrfs/ctree.h | 18 ------------------ fs/btrfs/inode.c | 1 + 4 files changed, 29 insertions(+), 19 deletions(-) create mode 100644 fs/btrfs/acl.h (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 548d6a5477b4..100bae33c677 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -11,10 +11,10 @@ #include #include #include - #include "ctree.h" #include "btrfs_inode.h" #include "xattr.h" +#include "acl.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h new file mode 100644 index 000000000000..45197b4f73bf --- /dev/null +++ b/fs/btrfs/acl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACL_H +#define BTRFS_ACL_H + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); +int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); + +#else + +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) +{ + return -EOPNOTSUPP; +} + +#endif + +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dcbfb1b9d269..040b640b0222 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,24 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* acl.c */ -#ifdef CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, - struct posix_acl *acl, int type); -int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, - struct posix_acl *acl, int type); -#else -#define btrfs_get_acl NULL -#define btrfs_set_acl NULL -static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, - int type) -{ - return -EOPNOTSUPP; -} -#endif - /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3fe3301b88b3..cb038dbbca7d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -65,6 +65,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "file.h" +#include "acl.h" struct btrfs_iget_args { u64 ino; -- cgit From 677074792a1d533232ec5517f23f78d64e6dffac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:34 -0400 Subject: btrfs: move relocation prototypes into relocation.h Move these out of ctree.h into relocation.h to cut down on code in ctree.h Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 20 -------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/relocation.h | 23 +++++++++++++++++++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 1 + 9 files changed, 30 insertions(+), 20 deletions(-) create mode 100644 fs/btrfs/relocation.h (limited to 'fs') diff --git a/fs/btrfs/backref.c 
b/fs/btrfs/backref.c index 173df40da984..04cb608e7cfb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -18,6 +18,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "relocation.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4b47d380da1e..0acd85111cdf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -21,6 +21,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "relocation.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 040b640b0222..b1b6de508e20 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,26 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* relocation.c */ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); -int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); -struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, - u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); - /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e8082e3c5bdc..a613a9a7caae 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@ #include "root-tree.h" #include "defrag.h" #include "uuid-tree.h" +#include "relocation.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb038dbbca7d..d8856e621e01 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,7 @@ #include "ioctl.h" #include "file.h" #include "acl.h" +#include "relocation.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e86364bdac8e..f31a97d4f9ad 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -33,6 +33,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "file-item.h" +#include "relocation.h" /* * Relocation overview diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h new file mode 100644 index 000000000000..2041a86186de --- /dev/null +++ b/fs/btrfs/relocation.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_RELOCATION_H +#define BTRFS_RELOCATION_H + +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); 
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); +int btrfs_should_ignore_reloc_root(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bbc3ad3e297c..a262fd188abe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,6 +32,7 @@ #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "relocation.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 823de2f29289..abfa6a9e20e6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ #include "accessors.h" #include "uuid-tree.h" #include "ioctl.h" +#include "relocation.h" static struct bio_set btrfs_bioset; -- cgit From 2fc6822c99d7e902b7cef146efa37420d41c0c59 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:35 -0400 Subject: btrfs: move scrub prototypes into scrub.h Move these out of ctree.h into scrub.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 ----------- fs/btrfs/dev-replace.c | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/scrub.h | 16 ++++++++++++++++ fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 1 + 9 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 fs/btrfs/scrub.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1b6de508e20..f3facc10646c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,17 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* scrub.c */ -int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, - u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); -void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); -void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); -int btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel_dev(struct btrfs_device *dev); -int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, - struct btrfs_scrub_progress *progress); - /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 94f8975034ce..84af2010fae2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -25,6 +25,7 @@ #include "block-group.h" #include "fs.h" #include "accessors.h" +#include "scrub.h" /* * Device replace overview diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a613a9a7caae..880bd39680bc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "defrag.h" #include "uuid-tree.h" #include "relocation.h" +#include "scrub.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dba1f07b4194..98cf16f118b5 100644 --- 
a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -59,6 +59,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "file.h" +#include "scrub.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e46e6c4d4bf9..8c89e63ef2e8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -24,6 +24,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "scrub.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h new file mode 100644 index 000000000000..7639103ebf9d --- /dev/null +++ b/fs/btrfs/scrub.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SCRUB_H +#define BTRFS_SCRUB_H + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2cd05246bcd3..4bfda9be4556 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "defrag.h" #include "dir-item.h" #include "ioctl.h" +#include "scrub.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a262fd188abe..593b2414e2be 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -33,6 +33,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" +#include "scrub.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index abfa6a9e20e6..5606060770eb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -38,6 +38,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" +#include "scrub.h" static struct bio_set btrfs_bioset; -- cgit From 77407dc032e29f205a1e3922564c0abfd2e23d15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:36 -0400 Subject: btrfs: move dev-replace prototypes into dev-replace.h We already have a dev-replace.h, simply move these prototypes and helpers into dev-replace.h where they belong. 
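As a usage note for the helpers being relocated here: the blocked increment is paired with a decrement around an I/O section so that a device replace cannot complete while the I/O is in flight. A minimal sketch with a hypothetical function name and no error paths:

static void example_io_section(struct btrfs_fs_info *fs_info)
{
	btrfs_bio_counter_inc_blocked(fs_info);	/* may wait on dev-replace */
	/* ... map and submit the bio ... */
	btrfs_bio_counter_dec(fs_info);		/* wraps _sub(fs_info, 1) */
}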
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --------- fs/btrfs/dev-replace.h | 8 ++++++++ fs/btrfs/extent_io.c | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f3facc10646c..84bc33ff003f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,15 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* dev-replace.c */ -void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); - -static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) -{ - btrfs_bio_counter_sub(fs_info, 1); -} - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 6084b313056a..675082ccec89 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} + #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7fe637408e98..6373b1565250 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -34,6 +34,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "dev-replace.h" static struct kmem_cache *extent_buffer_cache; -- cgit From 5c11adcc383a94cacd652461c9435bf7a5c53c9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:37 -0400 Subject: btrfs: move verity prototypes into verity.h Move these out of ctree.h into verity.h to cut down on code in ctree.h. 
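This header uses the same stub technique as acl.h: with CONFIG_FS_VERITY disabled, the inline fallbacks keep call sites free of #ifdefs. A sketch of what that buys a caller, using a hypothetical helper name:

/* Compiles identically with or without CONFIG_FS_VERITY; the disabled
 * stub in verity.h simply returns 0. */
static int example_drop_verity(struct btrfs_inode *inode)
{
	return btrfs_drop_verity_items(inode);
}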
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 22 ---------------------- fs/btrfs/inode.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/verity.h | 28 ++++++++++++++++++++++++++++ 6 files changed, 32 insertions(+), 22 deletions(-) create mode 100644 fs/btrfs/verity.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84bc33ff003f..15bb90536460 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -741,28 +741,6 @@ static inline int is_fstree(u64 rootid) return 0; } -/* verity.c */ -#ifdef CONFIG_FS_VERITY - -extern const struct fsverity_operations btrfs_verityops; -int btrfs_drop_verity_items(struct btrfs_inode *inode); -int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); - -#else - -static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) -{ - return 0; -} - -static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, - size_t buf_size) -{ - return -EPERM; -} - -#endif - /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8856e621e01..0fda1c5ba28e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -67,6 +67,7 @@ #include "file.h" #include "acl.h" #include "relocation.h" +#include "verity.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e9b7deccc5fe..3befc0d2d866 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -31,6 +31,7 @@ #include "dir-item.h" #include "file-item.h" #include "ioctl.h" +#include "verity.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4bfda9be4556..ae49bdf71d32 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -56,6 +56,7 @@ #include "dir-item.h" #include "ioctl.h" #include "scrub.h" +#include "verity.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 00ba5143a17d..b31d6c7627ff 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -19,6 +19,7 @@ #include "fs.h" #include "accessors.h" #include "ioctl.h" +#include "verity.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h new file mode 100644 index 000000000000..91c10f7d0a46 --- /dev/null +++ b/fs/btrfs/verity.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_VERITY_H +#define BTRFS_VERITY_H + +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; + +int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + +#endif + +#endif -- cgit From 6a6b4daf92fd8393eeac9631318d48d5b25ee4df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:38 -0400 Subject: btrfs: move CONFIG_BTRFS_FS_RUN_SANITY_TESTS checks to fs.h We already have a few of these in fs.h, move the remaining checks out of ctree.h into fs.h. 
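For readers unfamiliar with EXPORT_FOR_TESTS, which is part of what moves here: it expands to nothing when the sanity tests are configured in (giving the symbol external linkage for the test code) and to static otherwise. A sketch of the intended use, with a hypothetical function name:

/* static in normal builds; visible to the selftests when
 * CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled. */
EXPORT_FOR_TESTS int example_visible_to_tests(struct btrfs_fs_info *fs_info)
{
	return btrfs_is_testing(fs_info) ? 1 : 0;
}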
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 15 --------------- fs/btrfs/fs.h | 9 +++++++++ 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 15bb90536460..27bfedf3a9fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -722,16 +722,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) return offset >> PAGE_SHIFT; } -/* - * Use that for functions that are conditionally exported for sanity tests but - * otherwise static - */ -#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define EXPORT_FOR_TESTS static -#else -#define EXPORT_FOR_TESTS -#endif - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -741,11 +731,6 @@ static inline int is_fstree(u64 rootid) return 0; } -/* Sanity test specific functions */ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_destroy_inode(struct inode *inode); -#endif - static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a49d11127bf7..7d0da8509567 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -971,11 +971,20 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) &(fs_info)->fs_state))) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define EXPORT_FOR_TESTS + static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); } + +void btrfs_test_destroy_inode(struct inode *inode); + #else + +#define EXPORT_FOR_TESTS static + static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return 0; -- cgit From c03b22076bd2c9fe88c055a35637a836d986db76 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:39 -0400 Subject: btrfs: move super prototypes into super.h Move these out of ctree.h into super.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/super.h | 12 ++++++++++++ 5 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 fs/btrfs/super.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 27bfedf3a9fb..c32f6b6ae972 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -682,13 +682,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* super.c */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); -int btrfs_sync_fs(struct super_block *sb, int wait); -char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid); - /* * Get the correct offset inside the page of extent buffer. 
* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 880bd39680bc..f5f793af12a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -51,6 +51,7 @@ #include "uuid-tree.h" #include "relocation.h" #include "scrub.h" +#include "super.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 98cf16f118b5..6f9f8e8dfea6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -60,6 +60,7 @@ #include "ioctl.h" #include "file.h" #include "scrub.h" +#include "super.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae49bdf71d32..d54bfec8e506 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -57,6 +57,7 @@ #include "ioctl.h" #include "scrub.h" #include "verity.h" +#include "super.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h new file mode 100644 index 000000000000..c8875653e628 --- /dev/null +++ b/fs/btrfs/super.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUPER_H +#define BTRFS_SUPER_H + +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, + unsigned long new_flags); +int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); + +#endif -- cgit From 7f0add250f829248281e1745d92648f72192a8f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:40 -0400 Subject: btrfs: move super_block specific helpers into super.h This will make syncing fs.h to user space a little easier if we can pull the super block specific helpers out of fs.h and put them in super.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/defrag.c | 1 + fs/btrfs/export.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/fs.h | 17 ----------------- fs/btrfs/inode.c | 1 + fs/btrfs/lzo.c | 1 + fs/btrfs/messages.c | 1 + fs/btrfs/ordered-data.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/super.h | 17 +++++++++++++++++ fs/btrfs/volumes.c | 1 + 17 files changed, 32 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 61828f8375e6..e66fa18dbcf7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -35,6 +35,7 @@ #include "subpage.h" #include "zoned.h" #include "file-item.h" +#include "super.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index ed82085acea3..48a9460d49c7 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -15,6 +15,7 @@ #include "subpage.h" #include "defrag.h" #include "file-item.h" +#include "super.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index b6bc9684648f..744a02b7fd67 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -8,6 +8,7 @@ #include "print-tree.h" #include "export.h" #include "accessors.h" +#include "super.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6373b1565250..bda420d697f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -35,6 +35,7 @@ 
#include "file-item.h" #include "file.h" #include "dev-replace.h" +#include "super.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 20d88cd0b602..036d50af5666 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -20,6 +20,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b94dc4b2c486..f9bea57abbde 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,6 +36,7 @@ #include "file-item.h" #include "ioctl.h" #include "file.h" +#include "super.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index aef075b63188..797edb41d0aa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7d0da8509567..c7f2a512fba2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -801,11 +801,6 @@ static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_in return READ_ONCE(fs_info->last_root_drop_gen); } -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - /* * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. @@ -947,18 +942,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) btrfs_fs_closing(fs_info); } -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0fda1c5ba28e..83e5ae6b74ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -68,6 +68,7 @@ #include "acl.h" #include "relocation.h" #include "verity.h" +#include "super.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 6751874a3e69..e7b1ceffcd33 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -16,6 +16,7 @@ #include "messages.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define LZO_LEN 4 diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 196757ee16f1..625bbbbb2608 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -5,6 +5,7 @@ #include "discard.h" #include "transaction.h" #include "space-info.h" +#include "super.h" #ifdef CONFIG_PRINTK diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2aea2a17ed95..8fda1949b71b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" +#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 9ad15d69718c..0755af0e53e3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -14,6 
+14,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "super.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 3c962b5d8dbd..9d728107536e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -14,6 +14,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f31a97d4f9ad..d119986d1599 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -34,6 +34,7 @@ #include "root-tree.h" #include "file-item.h" #include "relocation.h" +#include "super.h" /* * Relocation overview diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index c8875653e628..8dbb909b364f 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -9,4 +9,21 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5606060770eb..1c9f7a946657 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -39,6 +39,7 @@ #include "ioctl.h" #include "relocation.h" #include "scrub.h" +#include "super.h" static struct bio_set btrfs_bioset; -- cgit From aa5d3003ddee8d7c5c517db072f888e114ff1529 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:41 -0400 Subject: btrfs: move orphan prototypes into orphan.h Move these out of ctree.h into orphan.h to cut down on code in ctree.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent-tree.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/orphan.c | 1 + fs/btrfs/orphan.h | 11 +++++++++++ fs/btrfs/root-tree.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/verity.c | 1 + 8 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 fs/btrfs/orphan.h (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c32f6b6ae972..5649f8907984 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -676,12 +676,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(struct extent_buffer *leaf); -/* orphan.c */ -int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); - /* * Get the correct offset inside the page of extent buffer. 
* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 940d4fe23cfb..b037107678c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -41,6 +41,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "file-item.h" +#include "orphan.h" #undef SCRAMBLE_DELAYED_REFS
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83e5ae6b74ef..9abed40ade11 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -69,6 +69,7 @@ #include "relocation.h" #include "verity.h" #include "super.h" +#include "orphan.h" struct btrfs_iget_args { u64 ino;
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index aa534108c1e2..7a1b021b5669 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -5,6 +5,7 @@ #include "ctree.h" #include "disk-io.h" +#include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset)
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h new file mode 100644 index 000000000000..3faab5cbb59a --- /dev/null +++ b/fs/btrfs/orphan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ORPHAN_H +#define BTRFS_ORPHAN_H + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 42f046e5e25f..859874579456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -15,6 +15,7 @@ #include "space-info.h" #include "accessors.h" #include "root-tree.h" +#include "orphan.h" /* * Read a root item from the tree. In case we detect a root item smaller then
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1c505713511c..b6e99ef99679 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -28,6 +28,7 @@ #include "dir-item.h" #include "file-item.h" #include "file.h" +#include "orphan.h" #define MAX_CONFLICT_INODES 10
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b31d6c7627ff..bf9eb693a6a7 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -20,6 +20,7 @@ #include "accessors.h" #include "ioctl.h" #include "verity.h" +#include "orphan.h" /* * Implementation of the interface defined in struct fsverity_operations.
-- cgit From d52a1365258b5781c2635937afa91af66322d981 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:35 +0800 Subject: btrfs: selftests: remove impossible inline extent at non-zero file offset
In our inode-tests.c, we create an inline extent at file offset 5, which is no longer possible since the introduction of the tree-checker. Thus we should not spend time maintaining corner cases which are already ruled out by the tree-checker.
So this patch will: - Change the inline extent to start at file offset 0 Also change its length to 6 to cover the original length - Add an extra ASSERT() for btrfs_add_extent_mapping() This is to make sure the tree-checker is working correctly.
- Update the inode selftest Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 7 ++++++ fs/btrfs/tests/inode-tests.c | 57 +++++++++++++++----------------------------- 2 files changed, 26 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index cd45303f344d..4a4362f5cc52 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -638,6 +638,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, int ret; struct extent_map *em = *em_in; + /* + * Tree-checker should have rejected any inline extent with non-zero + * file offset. Here just do a sanity check. + */ + if (em->block_start == EXTENT_MAP_INLINE) + ASSERT(em->start == 0); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 0a34a54ea9fd..05b03f5eab83 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -73,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] - * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] + * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [inline][hole but no extent][ hole ][ regular ][regular1 split] * * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] @@ -91,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) u64 disk_bytenr = SZ_1M; u64 offset = 0; - /* First we want a hole */ - insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, - slot); - slot++; - offset += 5; - /* - * Now we want an inline extent, I don't think this is possible but hey - * why not? Also keep in mind if we have an inline extent it counts as - * the whole first page. If we were to expand it we would have to cow - * and we wouldn't have an inline extent anymore. + * Tree-checker has strict limits on inline extents that they can only + * exist at file offset 0, thus we can only have one inline file extent + * at most. 
*/ - insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; offset = sectorsize; @@ -282,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); - goto out; - } - if (em->start != 0 || em->len != 5) { - test_err( - "unexpected extent wanted start 0 len 5, got start %llu len %llu", - em->start, em->len); - goto out; - } - if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); - goto out; - } - offset = em->start + em->len; - free_extent_map(em); - - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); - if (IS_ERR(em)) { - test_err("got an error when we shouldn't have"); - goto out; - } if (em->block_start != EXTENT_MAP_INLINE) { test_err("expected an inline, got %llu", em->block_start); goto out; } - if (em->start != offset || em->len != (sectorsize - 5)) { + /* + * For inline extent, we always round up the em to sectorsize, as + * they are either: + * + * a) a hidden hole + * The range will be zeroed at inline extent read time. + * + * b) a file extent with unaligned bytenr + * Tree checker will reject it. + */ + if (em->start != 0 || em->len != sectorsize) { test_err( - "unexpected extent wanted start %llu len 1, got start %llu len %llu", - offset, em->start, em->len); + "unexpected extent wanted start 0 len %u, got start %llu len %llu", + sectorsize, em->start, em->len); goto out; } if (em->flags != 0) {
-- cgit From affc5424338682641829cc84293ec90f36425034 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:36 +0800 Subject: btrfs: make inline extent read calculation much simpler
Currently we calculate inline extent reads in a way that allows an inline extent to start at a non-zero offset. This is consistent with the inode selftests, which used to put an inline extent at file offset 5. Meanwhile the inline extent creation code will only create inline extents at file offset 0.
Furthermore, with the introduction of tree-checker verification of file extents, we actively reject inline extents which start at a non-zero file offset. So far we haven't seen any reports of rejected inline extents at a non-zero file offset.
All this means the extra calculation to support inline extents at a non-zero file offset is mostly dead weight and damages the readability of the code.
Thus this patch will: - Add extra ASSERT()s to make sure the involved file offsets are all 0 - Remove the @extent_offset calculation - Simplify the involved code As several variables are now single-use, there is no need to declare them as variables anymore.
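To see why the simplification is behavior-preserving, plug the new invariants into the old arithmetic; this is a sketch of the reasoning, not code from the patch:

/*
 * With inline extents pinned to file offset 0:
 *   page_offset(page) == 0, pg_offset == 0, extent_start == 0
 * so the old
 *   extent_offset = page_offset(page) + pg_offset - extent_start == 0
 *   copy_size     = min(PAGE_SIZE - pg_offset, size - extent_offset)
 * collapses to the new
 *   copy_size     = min(PAGE_SIZE, btrfs_file_extent_ram_bytes(leaf, item))
 */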
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs')
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9abed40ade11..a374dd718315 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7064,41 +7064,43 @@ next: extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - unsigned long ptr; char *map; - size_t size; - size_t extent_offset; size_t copy_size; if (!page) goto out; - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_SIZE - pg_offset, - size - extent_offset); - em->start = extent_start + extent_offset; + /* + * Inline extent can only exist at file offset 0. This is + * ensured by tree-checker and inline extent creation path. + * Thus all members representing file offsets should be zero. + */ + ASSERT(page_offset(page) == 0); + ASSERT(pg_offset == 0); + ASSERT(extent_start == 0); + ASSERT(em->start == 0); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(leaf, item)); + em->start = extent_start; em->len = ALIGN(copy_size, fs_info->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; - ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, page, pg_offset, - extent_offset, item); + ret = uncompress_inline(path, page, 0, 0, item); if (ret) goto out; } else { map = kmap_local_page(page); - read_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - if (pg_offset + copy_size < PAGE_SIZE) { - memset(map + pg_offset + copy_size, 0, - PAGE_SIZE - pg_offset - - copy_size); - } + read_extent_buffer(leaf, map, + btrfs_file_extent_inline_start(item), + copy_size); + if (copy_size < PAGE_SIZE) + memset(map + copy_size, 0, + PAGE_SIZE - copy_size); kunmap_local(map); } flush_dcache_page(page);
-- cgit From a196a8944f77b7b762795a0862d8aa4a005625a4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:37 +0800 Subject: btrfs: do not reset extent map members for inline extents read
Currently for inline extent reads inside btrfs_get_extent(), we reset several extent map members:
- em->start Reset to extent_start, which is completely unnecessary. Both extent_start and em->start should already be zero, as ensured by the tree-checker.
- em->len Reset to round_up(copy_size, fs_info->sectorsize), which is again unnecessary.
- em->orig_block_len Reset to em->len (sectorsize), while it is originally unset from btrfs_extent_item_to_extent_map(). This makes no difference, as all extent map handling paths ignore orig_block_len when they find an inline extent. One example can be found in btrfs_drop_extent_cache().
- em->orig_start Reset to em->start (0), while it is originally set to EXTENT_MAP_HOLE. This makes no difference either, as all extent map handling paths ignore em->orig_start when they find an inline extent.
Thus all these member resets are unnecessary. Replace them with ASSERT()s checking the only two members (block_start and length) that make sense.
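Summarizing the four members discussed above (a recap of this commit's reasoning, not new analysis):

/*
 * member              old reset value                why it never mattered
 * em->start           extent_start (== 0)            already 0, per tree-checker
 * em->len             ALIGN(copy_size, sectorsize)   already one sector
 * em->orig_block_len  em->len                        ignored for inline extents
 * em->orig_start      em->start                      ignored for inline extents
 */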
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a374dd718315..5b45eb141215 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7082,10 +7082,15 @@ next: copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(leaf, item)); - em->start = extent_start; - em->len = ALIGN(copy_size, fs_info->sectorsize); - em->orig_block_len = em->len; - em->orig_start = em->start; + + /* + * btrfs_extent_item_to_extent_map() should have properly + * initialized em members already. + * + * Other members are not utilized for inline extents. + */ + ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->len == fs_info->sectorsize); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != -- cgit From 280f15cb96a61e4122bb28cdc316343ff4918b7d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:38 +0800 Subject: btrfs: remove new_inline argument from btrfs_extent_item_to_extent_map() The argument @new_inline changes the following members of extent_map: - em->compress_type - EXTENT_FLAG_COMPRESSED of em->flags However neither member makes a difference for inline extents: - Inline extent reads never use the above em members As inside btrfs_get_extent() we directly use the file extent item to do the read. - Inline extents are never to be split Thus code that really needs em->compress_type or that flag will never be executed on inline extents. (btrfs_drop_extent_cache() would be one example) - Fiemap no longer relies on extent maps Recent fiemap optimization makes fiemap search the subvolume tree directly, without using any extent map at all. Thus those members make no difference for inline extents any more. Furthermore, such an exception without much explanation is really a source of confusion. Thus this patch will completely remove the argument, and always set the involved members, unifying the behavior.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 2 +- fs/btrfs/file-item.c | 6 ++---- fs/btrfs/file-item.h | 1 - fs/btrfs/inode.c | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 48a9460d49c7..919dfe0f7e50 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -582,7 +582,7 @@ iterate: goto next; /* Now this extent covers @start, convert it to em */ - btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + btrfs_extent_item_to_extent_map(inode, &path, fi, em); break; next: ret = btrfs_next_item(root, &path); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 036d50af5666..456f71b42a9c 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1214,7 +1214,6 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, - const bool new_inline, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1266,10 +1265,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, */ em->orig_start = EXTENT_MAP_HOLE; em->block_len = (u64)-1; - if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + em->compress_type = compress_type; + if (compress_type != BTRFS_COMPRESS_NONE) set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 51cd3ab5948c..ba12711cf933 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -24,7 +24,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, - const bool new_inline, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b45eb141215..0ab899132dbf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7058,7 +7058,7 @@ next: goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, !page, em); + btrfs_extent_item_to_extent_map(inode, path, item, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { -- cgit From a982fc822001e26930b502b16534f6bbd42b232a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:39 +0800 Subject: btrfs: extract the inline extent read code into its own function Currently we have the inline extent read code behind two levels of indentation; factor it out into a new function, read_inline_extent(), to make it a little easier to read. Since we're here, also remove the @extent_offset and @pg_offset arguments from the uncompress_inline() function, as it's not possible to have inline extents at a non-zero file offset.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 69 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0ab899132dbf..1422f072a7b8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6865,7 +6865,6 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, - size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { int ret; @@ -6876,7 +6875,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, unsigned long ptr; int compress_type; - WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); @@ -6888,8 +6886,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, - extent_offset, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6899,13 +6896,40 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) - memzero_page(page, pg_offset + max_size, - PAGE_SIZE - max_size - pg_offset); + if (max_size < PAGE_SIZE) + memzero_page(page, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } +static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, + struct page *page) +{ + struct btrfs_file_extent_item *fi; + void *kaddr; + size_t copy_size; + + if (!page || PageUptodate(page)) + return 0; + + ASSERT(page_offset(page) == 0); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) + return uncompress_inline(path, page, fi); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(path->nodes[0], fi)); + kaddr = kmap_local_page(page); + read_extent_buffer(path->nodes[0], kaddr, + btrfs_file_extent_inline_start(fi), copy_size); + kunmap_local(kaddr); + if (copy_size < PAGE_SIZE) + memzero_page(page, copy_size, PAGE_SIZE - copy_size); + return 0; +} + /* * Lookup the first extent overlapping a range in a file. * @@ -7064,25 +7088,15 @@ next: extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - char *map; - size_t copy_size; - - if (!page) - goto out; - /* * Inline extent can only exist at file offset 0. This is * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ - ASSERT(page_offset(page) == 0); ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); - copy_size = min_t(u64, PAGE_SIZE, - btrfs_file_extent_ram_bytes(leaf, item)); - /* * btrfs_extent_item_to_extent_map() should have properly * initialized em members already. 
@@ -7092,24 +7106,9 @@ next: ASSERT(em->block_start == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - if (!PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) != - BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, page, 0, 0, item); - if (ret) - goto out; - } else { - map = kmap_local_page(page); - read_extent_buffer(leaf, map, - btrfs_file_extent_inline_start(item), - copy_size); - if (copy_size < PAGE_SIZE) - memset(map + copy_size, 0, - PAGE_SIZE - copy_size); - kunmap_local(map); - } - flush_dcache_page(page); - } + ret = read_inline_extent(inode, path, page); + if (ret < 0) + goto out; goto insert; } not_found: -- cgit From c30ff698da87aeabb459cca75fe4a77a858609fb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 18:12:04 +0200 Subject: btrfs: fix SPDX comment in tree-mod-log.h The header files should use the /* */ comment style, introduced in commit f3a84ccd28d0 ("btrfs: move the tree mod log code into its own file"). Signed-off-by: David Sterba --- fs/btrfs/tree-mod-log.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 8cffe0bc2a39..94f10afeee97 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -- cgit From 20af93d97f46a824647326278df7d377ba3269ff Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 31 Oct 2022 11:43:56 +0000 Subject: btrfs: update stale comment for nowait direct IO writes When doing a direct IO write we may need to fall back to buffered IO, and we have a comment at btrfs_direct_write() that says we can't directly fall back to buffered IO if we have a NOWAIT iocb, because we have no support for NOWAIT buffered writes. That is not true anymore, as support for NOWAIT buffered writes was added recently in commit 926078b21db9 ("btrfs: enable nowait async buffered writes"). However we still can't fall back to a buffered write in case we have a NOWAIT iocb, because we'll need to flush delalloc and wait for it to complete after doing the buffered write, and that can block for several reasons, the main reason being waiting for IO to complete. So update the comment to mention all that. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f9bea57abbde..f8be9d629e75 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1575,8 +1575,8 @@ buffered: /* * If we are in a NOWAIT context, then return -EAGAIN to signal the caller * it must retry the operation in a context where blocking is acceptable, - * since we currently don't have NOWAIT semantics support for buffered IO - * and may block there for many reasons (reserving space for example). + * because even if we end up not blocking during the buffered IO attempt + * below, we will block when flushing and waiting for the IO. */ if (iocb->ki_flags & IOCB_NOWAIT) { err = -EAGAIN; -- cgit From 428c8e03109e717d90e0d1329dc3d926b9421ad3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 26 Oct 2022 23:25:14 +0200 Subject: btrfs: simplify percent calculation helpers, rename div_factor The div_factor* helpers calculate a fraction of a value. The name is a bit confusing: we use them only for percentage calculations, and there are two helpers.
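To make the rename concrete, a small standalone model of what the two helpers compute today (illustrative only; it ignores the factor == 10/100 shortcuts and the div_u64 wrapper used in the kernel):

    #include <stdint.h>

    /* div_factor works in tenths: div_factor(n, 8) is 80% of n. */
    static inline uint64_t model_div_factor(uint64_t num, int factor)
    {
            return num * factor / 10;
    }

    /* div_factor_fine works in hundredths: div_factor_fine(n, 90) is 90% of n. */
    static inline uint64_t model_div_factor_fine(uint64_t num, int factor)
    {
            return num * factor / 100;
    }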
There's a helper, mult_frac, for general fractions that tries to be accurate, but we multiply and divide by small numbers, so we can use the div_u64 helper. Rename the div_factor* helpers to use a 1..100 percentage range, and also drop the check for percentage == 100, as it's never hit. The conversions: * div_factor calculates tenths, so the numbers need to be adjusted * div_factor_fine is a direct replacement Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 6 +++--- fs/btrfs/block-rsv.c | 4 ++-- fs/btrfs/block-rsv.h | 2 +- fs/btrfs/free-space-cache.c | 3 +-- fs/btrfs/misc.h | 16 ++-------------- fs/btrfs/space-info.c | 4 ++-- fs/btrfs/sysfs.c | 2 +- fs/btrfs/transaction.c | 2 +- fs/btrfs/volumes.c | 12 +++++------- 9 files changed, 18 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index cfe495d39ebd..ea5fcb665daa 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1551,7 +1551,7 @@ static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_f if (reclaim_thresh == 0) return false; - thresh = div_factor_fine(bg->length, reclaim_thresh); + thresh = mult_perc(bg->length, reclaim_thresh); /* * If we were below the threshold before don't reclaim, we are likely a @@ -3523,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(fs_info->super_copy); - thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) return 1; } - if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) + if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) return 0; return 1; } diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 29705256727f..5367a14d44d2 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -227,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -236,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) return 0; spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); + num_bytes = mult_perc(block_rsv->size, min_percent); if (block_rsv->reserved >= num_bytes) ret = 0; spin_unlock(&block_rsv->lock); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index e2cf0dd43203..4cc41c9aaa82 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -63,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 797edb41d0aa..627bd6120368 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2726,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, btrfs_mark_bg_unused(block_group); } else if
(bg_reclaim_threshold && reclaimable_unusable >= - div_factor_fine(block_group->zone_capacity, - bg_reclaim_threshold)) { + mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 69826ccd6644..768583a440e1 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -40,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq) wake_up(wq); } -static inline u64 div_factor(u64 num, int factor) +static inline u64 mult_perc(u64 num, u32 percent) { - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); + return div_u64(num * percent, 100); } - -static inline u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); -} - /* Copy of is_power_of_two that is 64bit safe */ static inline bool is_power_of_two_u64(u64 n) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 11e5b5a1eb5a..d28ee4e36f3d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -859,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh; u64 used; - thresh = div_factor_fine(total, 90); + thresh = mult_perc(total, 90); lockdep_assert_held(&space_info->lock); @@ -977,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, 1); + min_bytes = mult_perc(global_rsv->size, 10); if (global_rsv->reserved < min_bytes + ticket->bytes) { spin_unlock(&global_rsv->lock); return false; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a14812f1254a..45615ce36498 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -764,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); /* Limit stripe size to 10% of available space. */ - val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); /* Must be multiple of 256M. 
*/ val &= ~((u64)SZ_256M - 1); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 593b2414e2be..2e2dd2ea109b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -949,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans) if (btrfs_check_space_for_delayed_refs(fs_info)) return true; - return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); + return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); } bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c9f7a946657..7751ab620761 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3614,16 +3614,14 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off if (bargs->usage_min == 0) user_thresh_min = 0; else - user_thresh_min = div_factor_fine(cache->length, - bargs->usage_min); + user_thresh_min = mult_perc(cache->length, bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) user_thresh_max = cache->length; else - user_thresh_max = div_factor_fine(cache->length, - bargs->usage_max); + user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; @@ -3647,7 +3645,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, else if (bargs->usage > 100) user_thresh = cache->length; else - user_thresh = div_factor_fine(cache->length, bargs->usage); + user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; @@ -5113,7 +5111,7 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ - ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } @@ -5144,7 +5142,7 @@ static void init_alloc_chunk_ctl_policy_zoned( } /* We don't want a chunk larger than 10% of writable space */ - limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), zone_size), min_chunk_size); ctl->max_chunk_size = min(limit, ctl->max_chunk_size); -- cgit From 8ec8519b47897d01a7e8c5ad95734529a3dd60bf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 13:07:03 +0200 Subject: btrfs: switch extent_page_data bit fields to bools The semantics of the two members are boolean, so change the types accordingly. We have spare space in extent_page_data due to alignment, so there's no change in size.
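The no-size-change claim can be checked with a small standalone program (a simplified model that keeps just a pointer-sized member ahead of the two flags; the real struct has more members, but the padding argument is the same):

    #include <stdbool.h>
    #include <stdio.h>

    struct with_bitfields {
            void *bio;
            unsigned int extent_locked:1;
            unsigned int sync_io:1;
    };

    struct with_bools {
            void *bio;
            bool extent_locked;
            bool sync_io;
    };

    int main(void)
    {
            /* On a typical 64-bit ABI both print 16: padding absorbs the bools. */
            printf("bitfields: %zu bools: %zu\n",
                   sizeof(struct with_bitfields), sizeof(struct with_bools));
            return 0;
    }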
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bda420d697f4..c1294a6de88a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -109,10 +109,10 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - unsigned int extent_locked:1; + bool extent_locked; /* tells the submit_bio code to use REQ_SYNC */ - unsigned int sync_io:1; + bool sync_io; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) -- cgit From ee5f017dccc79e5a9b442ca473631a53e1e7e376 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 13:07:05 +0200 Subject: btrfs: merge struct extent_page_data to btrfs_bio_ctrl The two structures appear on the same call paths: btrfs_bio_ctrl is embedded in extent_page_data, and we pass bio_ctrl to some functions. After merging there are fewer indirections and we have only one control structure. The packing remains the same. The btrfs_bio_ctrl was selected as the target structure as the operation is closer to bio processing. Structure layout: struct btrfs_bio_ctrl { struct bio * bio; /* 0 8 */ int mirror_num; /* 8 4 */ enum btrfs_compression_type compress_type; /* 12 4 */ u32 len_to_stripe_boundary; /* 16 4 */ u32 len_to_oe_boundary; /* 20 4 */ btrfs_bio_end_io_t end_io_func; /* 24 8 */ bool extent_locked; /* 32 1 */ bool sync_io; /* 33 1 */ /* size: 40, cachelines: 1, members: 8 */ /* padding: 6 */ /* last cacheline: 40 bytes */ }; Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 115 ++++++++++++++++++++++++--------------------------- 1 file changed, 55 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c1294a6de88a..2ec989b83f54 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -102,16 +102,14 @@ struct btrfs_bio_ctrl { u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; -}; -struct extent_page_data { - struct btrfs_bio_ctrl bio_ctrl; - /* tells writepage not to lock the state bits for this range * it still does the unlocking + /* + * Tell writepage not to lock the state bits for this range, it still + * does the unlocking. */ bool extent_locked; - /* tells the submit_bio code to use REQ_SYNC */ + /* Tell the submit_bio code to use REQ_SYNC */ bool sync_io; }; @@ -148,11 +146,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) } /* - * Submit or fail the current bio in an extent_page_data structure. + * Submit or fail the current bio in the bio_ctrl structure.
*/ -static void submit_write_bio(struct extent_page_data *epd, int ret) +static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) { - struct bio *bio = epd->bio_ctrl.bio; + struct bio *bio = bio_ctrl->bio; if (!bio) return; @@ -161,9 +159,9 @@ static void submit_write_bio(struct extent_page_data *epd, int ret) ASSERT(ret < 0); btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); /* The bio is owned by the end_io handler now */ - epd->bio_ctrl.bio = NULL; + bio_ctrl->bio = NULL; } else { - submit_one_bio(&epd->bio_ctrl); + submit_one_bio(bio_ctrl); } } @@ -2071,7 +2069,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, loff_t i_size, int *nr_ret) { @@ -2103,7 +2101,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ wbc->nr_to_write--; - epd->bio_ctrl.end_io_func = end_bio_extent_writepage; + bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -2197,7 +2195,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_page_clear_dirty(fs_info, page, cur, iosize); ret = submit_extent_page(op | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, + bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page), 0, false); @@ -2237,7 +2235,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return <0 for error. */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; @@ -2274,7 +2272,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - if (!epd->extent_locked) { + if (!bio_ctrl->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; @@ -2282,7 +2280,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -2326,9 +2324,9 @@ done: */ if (PageError(page)) end_extent_writepage(page, ret, page_start, page_end); - if (epd->extent_locked) { + if (bio_ctrl->extent_locked) { /* - * If epd->extent_locked, it's from extent_write_locked_range(), + * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), * the page can either be locked by lock_page() or * process_one_page(). * Let btrfs_page_unlock_writer() handle both cases. @@ -2367,7 +2365,7 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) * Return <0 if something went wrong, no page is locked. 
*/ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; int i, num_pages; @@ -2375,17 +2373,17 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; btrfs_tree_lock(eb); } if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (!epd->sync_io) + if (!bio_ctrl->sync_io) return 0; if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } while (1) { @@ -2432,7 +2430,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } lock_page(p); @@ -2672,7 +2670,7 @@ static void prepare_eb_write(struct extent_buffer *eb) */ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; @@ -2692,10 +2690,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, eb->start, page, eb->len, + bio_ctrl, eb->start, page, eb->len, eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); @@ -2718,7 +2716,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { u64 disk_bytenr = eb->start; int i, num_pages; @@ -2727,7 +2725,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, prepare_eb_write(eb); - epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -2736,7 +2734,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, p, + bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); @@ -2779,7 +2777,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -2832,7 +2830,7 @@ static int submit_eb_subpage(struct page *page, if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret == 0) { free_extent_buffer(eb); continue; @@ -2841,7 +2839,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_subpage_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -2851,7 +2849,7 @@ static int submit_eb_subpage(struct page 
*page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(epd, ret); + submit_write_bio(bio_ctrl, ret); return ret; } @@ -2876,7 +2874,7 @@ cleanup: * Return <0 for fatal error. */ static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2888,7 +2886,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc, epd); + return submit_eb_subpage(page, wbc, bio_ctrl); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2931,7 +2929,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret <= 0) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) @@ -2946,7 +2944,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) return ret; @@ -2957,10 +2955,9 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; @@ -3003,7 +3000,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - ret = submit_eb_page(page, wbc, &epd, &eb_context); + ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -3064,7 +3061,7 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&epd, ret); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_meta_io_unlock(fs_info); return ret; @@ -3073,9 +3070,9 @@ retry: /* * Walk the list of dirty pages of the given address space and write all of them. * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @epd: holds context for the write, namely the bio + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @bio_ctrl: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. 
This is desirable behaviour for memory-cleaning writeback, @@ -3087,7 +3084,7 @@ retry: */ static int extent_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = mapping->host; int ret = 0; @@ -3168,7 +3165,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); lock_page(page); } @@ -3179,7 +3176,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); wait_on_page_writeback(page); } @@ -3189,7 +3186,7 @@ retry: continue; } - ret = __extent_writepage(page, wbc, epd); + ret = __extent_writepage(page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -3219,7 +3216,7 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); goto retry; } @@ -3245,8 +3242,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) u64 cur = start; unsigned long nr_pages; const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 1, .sync_io = 1, }; @@ -3277,7 +3273,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &wbc_writepages, &epd); + ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); ASSERT(ret <= 0); if (ret < 0) { found_error = true; @@ -3287,7 +3283,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - submit_write_bio(&epd, found_error ? ret : 0); + submit_write_bio(&bio_ctrl, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -3300,10 +3296,9 @@ int extent_writepages(struct address_space *mapping, { struct inode *inode = mapping->host; int ret = 0; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; /* @@ -3311,8 +3306,8 @@ int extent_writepages(struct address_space *mapping, * protect the write pointer updates. */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); - ret = extent_write_cache_pages(mapping, wbc, &epd); - submit_write_bio(&epd, ret); + ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } -- cgit From 9c5ff9b42c1cb22823c94983b7d52121c559bf4d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:01 +0800 Subject: btrfs: raid56: extract the vertical stripe recovery code into recover_vertical() This refactor includes the following behavior change first: - Don't error out if only P/Q is corrupted The old code will directly error out if only P/Q is corrupted. Although it is a logical error if we go into the rebuild path with only P/Q corrupted, there is no need to error out. Just skip the rebuild and return the already good data. Then comes the following refactor which shouldn't cause behavior changes: - Introduce a helper to do vertical stripe recovery This not only reduces one indent level, but also paves the road for later data checksum verification in RMW cycles.
- Sort rbio->faila/b before recovery So we don't need to do the same swap every vertical stripe - Replace a BUG_ON() with ASSERT() Or checkpatch won't let me pass. - Mark recovered sectors uptodate after the recover loop - Do the cleanup for pointers unconditionally We only need to initialize @pointers and @unmap_array to NULL, so we can safely free them unconditionally. - Mark the repaired sector uptodate in recover_vertical() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 285 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 149 insertions(+), 136 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ef272e7d932c..34d51d2ef746 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1886,6 +1886,144 @@ fail: bio_endio(bio); } +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + const u32 sectorsize = fs_info->sectorsize; + const int faila = rbio->faila; + const int failb = rbio->failb; + int stripe_nr; + + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return; + + /* + * Setup our array of pointers with sectors from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order. + */ + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + /* + * If we're rebuilding a read, we have to use + * pages from the bio list + */ + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && + (stripe_nr == faila || stripe_nr == failb)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + } + ASSERT(sector->page); + pointers[stripe_nr] = kmap_local_page(sector->page) + + sector->pgoff; + unmap_array[stripe_nr] = pointers[stripe_nr]; + } + + /* All raid6 handling here */ + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* Single failure, rebuild from parity raid5 style */ + if (failb < 0) { + if (faila == rbio->nr_data) + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * We have nothing to do, just skip the + * recovery for this stripe. + */ + goto cleanup; + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* + * If the q stripe is failed, do a pstripe reconstruction from + * the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want. + */ + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == + RAID5_P_STRIPE) + /* + * Only P and Q are corrupted. + * We only care about data stripes recovery, + * can skip this vertical stripe. + */ + goto cleanup; + /* + * Otherwise we have one bad data stripe and + * a good P stripe. raid5! 
+ */ + goto pstripe; + } + + if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->real_stripes, sectorsize, + faila, pointers); + } else { + raid6_2data_recov(rbio->real_stripes, sectorsize, + faila, failb, pointers); + } + } else { + void *p; + + /* Rebuild from P stripe here (raid5 or raid6). */ + ASSERT(failb == -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + + /* Rearrange the pointer array */ + p = pointers[faila]; + for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; + stripe_nr++) + pointers[stripe_nr] = pointers[stripe_nr + 1]; + pointers[rbio->nr_data - 1] = p; + + /* Xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, sectorsize); + + } + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired in the vertical stripe, thus they are now + * uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. + */ + if (rbio->faila >= 0) { + sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); + sector->uptodate = 1; + } + if (rbio->failb >= 0) { + sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); + sector->uptodate = 1; + } + +cleanup: + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); +} + /* * all parity reconstruction happens here. We've read in everything * we can find from the drives and this does the heavy lifting of @@ -1893,13 +2031,10 @@ fail: */ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int sectornr, stripe; - void **pointers; - void **unmap_array; - int faila = -1, failb = -1; + int sectornr; + void **pointers = NULL; + void **unmap_array = NULL; blk_status_t err; - int i; /* * This array stores the pointer for each sector, thus it has the extra @@ -1908,7 +2043,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = BLK_STS_RESOURCE; - goto cleanup_io; + goto cleanup; } /* @@ -1918,11 +2053,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!unmap_array) { err = BLK_STS_RESOURCE; - goto cleanup_pointers; + goto cleanup; } - faila = rbio->faila; - failb = rbio->failb; + /* Make sure faila and fail b are in order. */ + if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb) + swap(rbio->faila, rbio->failb); if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { @@ -1933,138 +2069,15 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. 
- */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, &rbio->dbitmap)) - continue; - - /* - * Setup our array of pointers with sectors from each stripe - * - * NOTE: store a duplicate array of pointers to preserve the - * pointer order - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * If we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } - ASSERT(sector->page); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - unmap_array[stripe] = pointers[stripe]; - } - - /* All raid6 handling here */ - if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* Single failure, rebuild from parity raid5 style */ - if (failb < 0) { - if (faila == rbio->nr_data) { - /* - * Just the P stripe has failed, without - * a bad data or Q stripe. - * TODO, we should redo the xor here. - */ - err = BLK_STS_IOERR; - goto cleanup; - } - /* - * a single failure in raid6 is rebuilt - * in the pstripe code below - */ - goto pstripe; - } - - /* make sure our ps and qs are in order */ - if (faila > failb) - swap(faila, failb); - - /* if the q stripe is failed, do a pstripe reconstruction - * from the xors. - * If both the q stripe and the P stripe are failed, we're - * here due to a crc mismatch and we can't give them the - * data they want - */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) { - err = BLK_STS_IOERR; - goto cleanup; - } - /* - * otherwise we have one bad data stripe and - * a good P stripe. raid5! - */ - goto pstripe; - } - - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->real_stripes, - sectorsize, faila, pointers); - } else { - raid6_2data_recov(rbio->real_stripes, - sectorsize, faila, failb, - pointers); - } - } else { - void *p; - - /* rebuild from P stripe here (raid5 or raid6) */ - BUG_ON(failb != -1); -pstripe: - /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); - - /* rearrange the pointer array */ - p = pointers[faila]; - for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) - pointers[stripe] = pointers[stripe + 1]; - pointers[rbio->nr_data - 1] = p; - - /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - } - - /* - * No matter if this is a RMW or recovery, we should have all - * failed sectors repaired, thus they are now uptodate. - * Especially if we determine to cache the rbio, we need to - * have at least all data sectors uptodate. 
- */ - for (i = 0; i < rbio->stripe_nsectors; i++) { - if (faila != -1) { - sector = rbio_stripe_sector(rbio, faila, i); - sector->uptodate = 1; - } - if (failb != -1) { - sector = rbio_stripe_sector(rbio, failb, i); - sector->uptodate = 1; - } - } - for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) - kunmap_local(unmap_array[stripe]); - } + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + recover_vertical(rbio, sectornr, pointers, unmap_array); err = BLK_STS_OK; + cleanup: kfree(unmap_array); -cleanup_pointers: kfree(pointers); -cleanup_io: /* * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a * valid rbio which is consistent with ondisk content, thus such a -- cgit From 30e3c897f4a8d070d4fa079f6dade4b3e3cb74ad Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:02 +0800 Subject: btrfs: raid56: extract the pq generation code into a helper Currently finish_rmw() will update the P/Q stripes before submitting the writes. It's done behind a for(;;) loop and is a little congested indent-wise, so extract the code into a helper called generate_pq_vertical(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 90 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 34d51d2ef746..66dd4e9752a9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1193,6 +1193,48 @@ not_found: trace_info->stripe_nr = -1; } +/* Generate PQ for one vertical stripe. */ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + void **pointers = rbio->finish_pointers; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct sector_ptr *sector; + int stripe; + const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; + + /* First collect one sector from each data stripe */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; + } + + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + + if (has_qstripe) { + /* + * RAID6, add the qstripe and call the library function + * to fill in our p/q + */ + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; + + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, + pointers); + } else { + /* raid5 */ + memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + } + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); +} + /* * this is called from one of two situations. We either * have a full stripe from the higher layers, or we've read all @@ -1204,28 +1246,17 @@ not_found: static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; - void **pointers = rbio->finish_pointers; - int nr_data = rbio->nr_data; /* The total sector number inside the full stripe. */ int total_sector_nr; int stripe; /* Sector number inside a stripe.
*/ int sectornr; - bool has_qstripe; struct bio_list bio_list; struct bio *bio; int ret; bio_list_init(&bio_list); - if (rbio->real_stripes - rbio->nr_data == 1) - has_qstripe = false; - else if (rbio->real_stripes - rbio->nr_data == 2) - has_qstripe = true; - else - BUG(); - /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); @@ -1258,41 +1289,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - } - - /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; - - if (has_qstripe) { - /* - * RAID6, add the qstripe and call the library function - * to fill in our p/q - */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; - - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - for (stripe = stripe - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); /* * Start writing. Make bios for everything from the higher layers (the -- cgit From d31968d9b6ac684e7e64be0d46eb0f5fe38d1841 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:03 +0800 Subject: btrfs: raid56: extract the recovery bio list build code into a helper This new helper will be also utilized in the incoming refactor of recovery path. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 64 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 66dd4e9752a9..9234d3423b31 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2134,30 +2134,14 @@ static void raid_recover_end_io_work(struct work_struct *work) __raid_recover_end_io(rbio); } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - atomic_set(&rbio->error, 0); - + ASSERT(bio_list_size(bio_list) == 0); /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. 
@@ -2180,11 +2164,45 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret < 0) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + + return -EIO; +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->error, 0); + + ret = recover_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; bios_to_read = bio_list_size(&bio_list); if (!bios_to_read) { -- cgit From ec936b0354e2d716e977e16165d188db044696b7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:04 +0800 Subject: btrfs: raid56: extract sector recovery code into a helper This includes extra changes: - The allocation for unmap_array[] and pointers[] Now we allocate them in one go, and free them together. - Remove @err Use errno_to_blk_status(ret) instead. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 59 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9234d3423b31..da76e72383c3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2022,36 +2022,24 @@ cleanup: kunmap_local(unmap_array[stripe_nr]); } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +static int recover_sectors(struct btrfs_raid_bio *rbio) { - int sectornr; void **pointers = NULL; void **unmap_array = NULL; - blk_status_t err; + int sectornr; + int ret = 0; /* - * This array stores the pointer for each sector, thus it has the extra - * pgoff value added from each sector + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!pointers) { - err = BLK_STS_RESOURCE; - goto cleanup; - } - - /* - * Store copy of pointers that does not get reordered during - * reconstruction so that kunmap_local works. - */ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!unmap_array) { - err = BLK_STS_RESOURCE; - goto cleanup; + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; } /* Make sure faila and fail b are in order. */ @@ -2070,11 +2058,22 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) recover_vertical(rbio, sectornr, pointers, unmap_array); - err = BLK_STS_OK; - -cleanup: - kfree(unmap_array); +out: kfree(pointers); + kfree(unmap_array); + return ret; +} + +/* + * all parity reconstruction happens here. 
We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. + */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = recover_sectors(rbio); /* * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a @@ -2098,13 +2097,13 @@ cleanup: * Cache this rbio iff the above read reconstruction is * executed without problems. */ - if (err == BLK_STS_OK && rbio->failb < 0) + if (!ret && rbio->failb < 0) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - rbio_orig_end_io(rbio, err); - } else if (err == BLK_STS_OK) { + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } else if (!ret) { rbio->faila = -1; rbio->failb = -1; @@ -2115,7 +2114,7 @@ cleanup: else BUG(); } else { - rbio_orig_end_io(rbio, err); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } } -- cgit From d817ce35d24a53c6736ac68e759ed83135ff7c3b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:05 +0800 Subject: btrfs: raid56: switch recovery path to a single function Currently btrfs uses end_io functions to jump between different stages of recovery. For example, we go through the following functions: - raid56_bio_end_io() This handles the read for all the sectors (except the missing device). - __raid_recover_end_io() This does the real work, it's called inside the delayed work function raid_recover_end_io_work(). This one recovery path involves at least 3 different functions, which is a big burden for readers. This patch will change the behavior by: - Introduce a unified recovery entrance, recover_rbio() - Use a submit-and-wait method So the workflow is not interrupted by the endio function jump. This doesn't bring a performance change, but reduces the burden on reviewers. - Run the main function in the rmw_workers workqueue Now raid56_parity_recover() only needs to set up the work, and queue the work using start_async_work(). Now readers only need to do one function jump (start_async_work()) to find out the main entrance of the recovery path. Furthermore, the recover_rbio() function can easily be reused by other paths. The old recovery path is still utilized by the degraded write path. It will be cleaned up when we have migrated the write path. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 144 ++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/raid56.h | 2 + 2 files changed, 119 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index da76e72383c3..2dd87ceda9db 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -67,7 +67,6 @@ struct sector_ptr { static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct work_struct *work); -static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); @@ -752,6 +751,8 @@ out: return ret; } +static void recover_rbio_work_locked(struct work_struct *work); + /* * called as rmw or parity rebuild is completed. If the plug list has more
If the plug list has more * rbios waiting for this stripe, the next one on the list will be started @@ -809,10 +810,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock_irqrestore(&h->lock, flags); if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { steal_rbio(rbio, next); - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); start_async_work(next, rmw_work); @@ -989,6 +990,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, } bio_list_init(&rbio->bio_list); + init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); @@ -1519,6 +1521,39 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } +static void raid_wait_read_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (bio->bi_status) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(rbio, bio); + + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_read_end_io; + + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -2176,6 +2211,79 @@ error: return -EIO; } +static int recover_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set faila/b correctly. + */ + ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + bio_list_init(&bio_list); + + /* + * Reset error to 0, as we will later increase error for missing + * devices. + */ + atomic_set(&rbio->error, 0); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + + ret = recover_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We have more errors than our tolerance during the read. 
*/ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + + ret = recover_sectors(rbio); + +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void recover_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } +} + +static void recover_rbio_work_locked(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + /* * reads everything we need off the disk to reconstruct * the parity. endio handlers trigger final reconstruction @@ -2264,7 +2372,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); - goto out_end_bio; + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; @@ -2278,7 +2387,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, (u64)bio->bi_iter.bi_size, bioc->map_type); free_raid_bio(rbio); bio->bi_status = BLK_STS_IOERR; - goto out_end_bio; + bio_endio(bio); + return; } /* @@ -2298,18 +2408,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - if (lock_stripe_add(rbio)) - return; - - /* - * This adds our rbio to the list of rbios that will be handled after - * the current lock owner is done. - */ - __raid56_parity_recover(rbio); - return; - -out_end_bio: - bio_endio(bio); + start_async_work(rbio, recover_rbio_work); } static void rmw_work(struct work_struct *work) @@ -2320,14 +2419,6 @@ static void rmw_work(struct work_struct *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio; - - rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); -} - /* * The following code is used to scrub/replace the parity stripe * @@ -2818,6 +2909,5 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { - if (!lock_stripe_add(rbio)) - start_async_work(rbio, read_rebuild_work); + start_async_work(rbio, recover_rbio_work); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 91d5c0adad15..445e833fcfcf 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -95,6 +95,8 @@ struct btrfs_raid_bio { atomic_t error; + wait_queue_head_t io_wait; + struct work_struct end_io_work; /* Bitmap to record which horizontal stripe has data */ -- cgit From 509c27aa2fb69b4475d05d0ac5c5c4206e28d5d6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:06 +0800 Subject: btrfs: raid56: extract the rmw bio list build code into a helper The helper will later be used to refactor the whole RMW path. 
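For illustration, a minimal userspace C sketch of the assembly-helper pattern these patches keep applying: the helper only fills a caller-provided list and cleans up after itself on error, while submission stays with the caller. The toy list type and all names here are invented for the example and are not the kernel's bio_list API:

#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int sector; };
struct item_list { struct item *head; };

static void list_init(struct item_list *l) { l->head = NULL; }
static void list_push(struct item_list *l, struct item *it) { it->next = l->head; l->head = it; }
static struct item *list_pop(struct item_list *l)
{
	struct item *it = l->head;

	if (it)
		l->head = it->next;
	return it;
}

/* Assemble-only helper: on failure it frees everything it queued and
 * reports the error; it never "submits" anything itself. */
static int assemble_read_items(struct item_list *list, int nr_sectors)
{
	for (int i = 0; i < nr_sectors; i++) {
		struct item *it = malloc(sizeof(*it));

		if (!it)
			goto error;
		it->sector = i;
		list_push(list, it);
	}
	return 0;
error:
	for (struct item *it; (it = list_pop(list)); )
		free(it);
	return -1;
}

int main(void)
{
	struct item_list list;

	list_init(&list);
	if (assemble_read_items(&list, 4) < 0)
		return 1;
	/* Submission stays in the caller, as in the kernel refactor. */
	for (struct item *it; (it = list_pop(&list)); ) {
		printf("submit read for sector %d\n", it->sector);
		free(it);
	}
	return 0;
}

Splitting assembly from submission this way is what later lets one caller submit synchronously (submit-and-wait) and another from a work item, without duplicating the loop.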
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 56 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2dd87ceda9db..88e2204c9f28 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1595,28 +1595,16 @@ static void raid56_rmw_end_io_work(struct work_struct *work) validate_rbio_for_rmw(rbio); } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - index_rbio_pages(rbio); + ASSERT(bio_list_size(bio_list) == 0); - atomic_set(&rbio->error, 0); /* Build a list of bios to read all the missing data sectors. */ for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; total_sector_nr++) { @@ -1641,11 +1629,43 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) goto cleanup; } + return 0; + +cleanup: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; +} + +/* + * the stripe must be locked by the caller. It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + index_rbio_pages(rbio); + + atomic_set(&rbio->error, 0); + + ret = rmw_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; bios_to_read = bio_list_size(&bio_list); if (!bios_to_read) { -- cgit From 6486d21c99cb46666c11632e4d595568863b965c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:07 +0800 Subject: btrfs: raid56: extract rwm write bios assembly into a helper The helper will be later used to refactor the rmw write path. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 135 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 88e2204c9f28..5e5bfc2c21fc 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1237,65 +1237,23 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) kunmap_local(pointers[stripe]); } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - struct btrfs_io_context *bioc = rbio->bioc; + struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; - int stripe; - /* Sector number inside a stripe. 
*/ int sectornr; - struct bio_list bio_list; - struct bio *bio; + int stripe; int ret; - bio_list_init(&bio_list); + ASSERT(bio_list_size(bio_list) == 0); /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. - * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. - */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); - /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. - */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - generate_pq_vertical(rbio, sectornr); - - /* - * Start writing. Make bios for everything from the higher layers (the + * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; @@ -1317,15 +1275,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; + goto error; } - if (likely(!bioc->num_tgtdevs)) - goto write_data; + if (likely(!rbio->bioc->num_tgtdevs)) + return 0; + /* Make a copy for the replace target device. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; @@ -1333,7 +1292,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!bioc->tgtdev_map[stripe]) { + if (!rbio->bioc->tgtdev_map[stripe]) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1355,14 +1314,78 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, + ret = rbio_add_io_sector(rbio, bio_list, sector, rbio->bioc->tgtdev_map[stripe], sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; + goto error; } -write_data: + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return -EIO; +} + +/* + * this is called from one of two situations. We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ + /* The total sector number inside the full stripe. */ + /* Sector number inside a stripe. */ + int sectornr; + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + + /* We should have at least one data sector. 
*/ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->error, 0); + + /* + * now that we've set rmw_locked, run through the + * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; + atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); BUG_ON(atomic_read(&rbio->stripes_pending) == 0); -- cgit From 5eb30ee26fa4dbd2d31f50ee3b4212933f86cb57 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:08 +0800 Subject: btrfs: raid56: introduce the main entrance for RMW path The new entrance will be called rmw_rbio(); it will have a streamlined workflow using the submit-and-wait method. Thus there are no weird jumps between tons of functions, which makes it way more reader friendly and will make later expansion easier, as it's now a straight workflow and the timing is much clearer. Unfortunately we cannot yet migrate the RMW path to use this new entrance, as we still need extra work to address the plug and the unlock_stripe() function. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/raid56.h | 5 ++ 2 files changed, 166 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5e5bfc2c21fc..8fd633f01d9e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1252,6 +1252,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* + * Reset errors, as we may have errors inherited from the degraded + * write. + */ + atomic_set(&rbio->error, 0); + rbio->faila = -1; + rbio->failb = -1; + /* * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. @@ -1665,6 +1673,19 @@ cleanup: return ret; } +static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) +{ + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; + + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + + index_stripe_sectors(rbio); + return 0; +} + /* * the stripe must be locked by the caller.
It will * unlock after all the writes are done @@ -2454,6 +2475,146 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, start_async_work(rbio, recover_rbio_work); } +static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + atomic_set(&rbio->error, 0); + + ret = rmw_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + return ret; +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void raid_wait_write_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + blk_status_t err = bio->bi_status; + + if (err) + fail_bio_stripe(rbio, bio); + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_write_end_io; + + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + +int rmw_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + int sectornr; + int ret = 0; + + /* + * Allocate the pages for parity first, as P/Q pages will always be + * needed for both full-stripe and sub-stripe writes. + */ + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) + return ret; + + /* Full stripe write, can write the full stripe right now. */ + if (rbio_is_full(rbio)) + goto write; + /* + * Now we're doing sub-stripe write, also need all data stripes to do + * the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return ret; + + atomic_set(&rbio->error, 0); + index_rbio_pages(rbio); + + ret = rmw_read_and_wait(rbio); + if (ret < 0) + return ret; + + /* Too many read errors, beyond our tolerance. */ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) + return ret; + + /* Have read failures but under tolerance, needs recovery. */ + if (rbio->faila >= 0 || rbio->failb >= 0) { + ret = recover_rbio(rbio); + if (ret < 0) + return ret; + } +write: + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->error, 0); + + index_rbio_pages(rbio); + + /* + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + return ret; + + /* We should have at least one bio assembled. 
*/ + ASSERT(bio_list_size(&bio_list)); + submit_write_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We have more errors than our tolerance during the read. */ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) + ret = -EIO; + return ret; +} + static void rmw_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 445e833fcfcf..0e77c77c5dba 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -185,4 +185,9 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +/* + * Placeholder definition to avoid warning, will be removed when + * the full write path is migrated. + */ +int rmw_rbio(struct btrfs_raid_bio *rbio); #endif -- cgit From 93723095b5d54b923abf07459998bcb9bbac8ba6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:09 +0800 Subject: btrfs: raid56: switch write path to rmw_rbio() This includes the following changes: - Implement a new raid_unplug() function Now we don't need a workqueue to run the plug, as all it does is queue a rmw_rbio_work() call, which can be executed without sleeping. - Implement a rmw_rbio_work_locked() helper This is for unlock_stripe(), which is already holding the full stripe lock. - Remove all the old functions This already shows how complex the old code was, as we ended up removing the following functions: * rmw_work() * validate_rbio_for_rmw() * raid56_rmw_end_io_work() * raid56_rmw_stripe() * full_stripe_write() * partial_stripe_write() * __raid56_parity_write() * run_plug() * unplug_work() * btrfs_raid_unplug() * __raid56_parity_recover() * raid_recover_end_io_work() - Unexport rmw_rbio() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 350 +++++++----------------------------------------------- fs/btrfs/raid56.h | 5 - 2 files changed, 42 insertions(+), 313 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8fd633f01d9e..ffedbfde95e0 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -64,9 +64,9 @@ struct sector_ptr { unsigned int uptodate:8; }; -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct work_struct *work); +static void rmw_rbio_work(struct work_struct *work); +static void rmw_rbio_work_locked(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); @@ -816,7 +816,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); - start_async_work(next, rmw_work); + start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); start_async_work(next, scrub_parity_work); @@ -1108,23 +1108,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, return 0; } -/* - * while we're doing the read/modify/write cycle, we could - * have errors in reading pages off the disk. This checks - * for errors and if we're not able to read the page it'll - * trigger parity reconstruction.
The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) -{ - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); - } -} - static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; @@ -1601,31 +1584,6 @@ static void raid56_bio_end_io(struct bio *bio) &rbio->end_io_work); } -/* - * End io handler for the read phase of the RMW cycle. All the bios here are - * physical stripe bios we've read from the disk so we can recalculate the - * parity of the stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_rmw_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - rbio_orig_end_io(rbio, BLK_STS_IOERR); - return; - } - - /* - * This will normally call finish_rmw to start our write but if there - * are any failed stripes we'll reconstruct from parity first. - */ - validate_rbio_for_rmw(rbio); -} - static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1686,122 +1644,6 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) return 0; } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) -{ - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - struct bio *bio; - - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - index_rbio_pages(rbio); - - atomic_set(&rbio->error, 0); - - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } - - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; - - if (trace_raid56_read_partial_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_read_partial(rbio, bio, &trace_info); - } - submit_bio(bio); - } - /* the actual write will happen once the reads are done */ - return 0; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return -EIO; - -finish: - validate_rbio_for_rmw(rbio); - return 0; -} - -/* - * if the upper layers pass in a full stripe, we thank them by only allocating - * enough pages to hold the parity, and sending it all down quickly. - */ -static int full_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = alloc_rbio_parity_pages(rbio); - if (ret) - return ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - finish_rmw(rbio); - return 0; -} - -/* - * partial stripe writes get handed over to async helpers. 
- * We're really hoping to merge a few more writes into this - * rbio before calculating new parity - */ -static int partial_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - start_async_work(rbio, rmw_work); - return 0; -} - -/* - * sometimes while we were reading from the drive to - * recalculate parity, enough new bios come into create - * a full stripe. So we do a check here to see if we can - * go directly to finish_rmw - */ -static int __raid56_parity_write(struct btrfs_raid_bio *rbio) -{ - /* head off into rmw land if we don't have a full stripe */ - if (!rbio_is_full(rbio)) - return partial_stripe_write(rbio); - return full_stripe_write(rbio); -} - /* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged @@ -1836,28 +1678,22 @@ static int plug_cmp(void *priv, const struct list_head *a, return 0; } -static void run_plug(struct btrfs_plug_cb *plug) +static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { + struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; - /* - * sort our plug list then try to merge - * everything we can in hopes of creating full - * stripes. - */ list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { cur = list_entry(plug->rbio_list.next, struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { - int ret; - - /* we have a full stripe, send it down */ - ret = full_stripe_write(cur); - BUG_ON(ret); + /* We have a full stripe, queue it down. */ + start_async_work(cur, rmw_rbio_work); continue; } if (last) { @@ -1865,42 +1701,16 @@ static void run_plug(struct btrfs_plug_cb *plug) merge_rbio(last, cur); free_raid_bio(cur); continue; - } - __raid56_parity_write(last); + start_async_work(last, rmw_rbio_work); } last = cur; } - if (last) { - __raid56_parity_write(last); - } + if (last) + start_async_work(last, rmw_rbio_work); kfree(plug); } -/* - * if the unplug comes from schedule, we have to push the - * work off to a helper thread - */ -static void unplug_work(struct work_struct *work) -{ - struct btrfs_plug_cb *plug; - plug = container_of(work, struct btrfs_plug_cb, work); - run_plug(plug); -} - -static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct btrfs_plug_cb *plug; - plug = container_of(cb, struct btrfs_plug_cb, cb); - - if (from_schedule) { - INIT_WORK(&plug->work, unplug_work); - queue_work(plug->info->rmw_workers, &plug->work); - return; - } - run_plug(plug); -} - /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. 
*/ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { @@ -1948,19 +1758,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) rbio_add_bio(rbio, bio); /* - * don't plug on full rbios, just get them out the door + * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) { - ret = full_stripe_write(rbio); - if (ret) { - free_raid_bio(rbio); - goto fail; - } - return; - } + if (rbio_is_full(rbio)) + goto queue_rbio; - cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { @@ -1968,13 +1772,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - } else { - ret = __raid56_parity_write(rbio); - if (ret) { - free_raid_bio(rbio); - goto fail; - } + return; } +queue_rbio: + /* + * Either we don't have any existing plug, or we're doing a full stripe, + * can queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); return; @@ -2217,21 +2022,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } } -/* - * This is called only for stripes we've read from disk to reconstruct the - * parity. - */ -static void raid_recover_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - else - __raid_recover_end_io(rbio); -} - static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -2348,79 +2138,6 @@ static void recover_rbio_work_locked(struct work_struct *work) rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) -{ - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - struct bio *bio; - - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - atomic_set(&rbio->error, 0); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * we might have no bios to read just because the pages - * were up to date, or we might have no bios to read because - * the devices were gone. - */ - if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { - __raid_recover_end_io(rbio); - return 0; - } else { - goto cleanup; - } - } - - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. 
- */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; - - if (trace_raid56_scrub_read_recover_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); - } - submit_bio(bio); - } - - return 0; - -cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return -EIO; -} - /* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, @@ -2529,7 +2246,7 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, } } -int rmw_rbio(struct btrfs_raid_bio *rbio) +static int rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2615,12 +2332,29 @@ write: return ret; } -static void rmw_work(struct work_struct *work) +static void rmw_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } +} + +static void rmw_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_rmw_stripe(rbio); + + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } /* diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0e77c77c5dba..445e833fcfcf 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -185,9 +185,4 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); -/* - * Placeholder definition to avoid warning, will be removed when - * the full write path is migrated. - */ -int rmw_rbio(struct btrfs_raid_bio *rbio); #endif -- cgit From cb3450b7d7d0af6ed6ff60e174129938914083ab Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:10 +0800 Subject: btrfs: raid56: extract scrub read bio list assembly code into a helper Just like what we did for write/recovery, also extract the read bio assembly code into a helper for scrub. The differences between the three are: - rmw_assemble_read_bios() only submits reads for missing sectors Thus it will skip cached sectors, but will also read sectors which are not covered by any full stripe (for cache usage) - recover_assemble_read_bios() reads every sector which has not failed - scrub_assemble_read_bios() has an extra check for vertical stripes It's mostly the same as rmw_assemble_read_bios(), but will skip sectors which are not covered by a vertical stripe.
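Conceptually, the three helpers are one loop parameterized by a skip condition. A hedged userspace C sketch of that view; every struct, field, and value below is invented for the example and does not match the kernel's data structures:

#include <stdbool.h>
#include <stdio.h>

enum mode { RMW, RECOVER, SCRUB };

/* Stand-ins for the per-sector state the real code consults. */
struct sector { bool cached; bool failed; bool in_vertical_stripe; };

/* Each mode reads everything except what its own rules let it skip. */
static bool skip_sector(enum mode m, const struct sector *s)
{
	switch (m) {
	case RMW:	return s->cached;	/* cached data can be reused */
	case RECOVER:	return s->failed;	/* cannot read a failed device */
	case SCRUB:	return s->cached || !s->in_vertical_stripe;
	}
	return true;
}

int main(void)
{
	struct sector stripe[3] = {
		{ .cached = true,  .failed = false, .in_vertical_stripe = true  },
		{ .cached = false, .failed = true,  .in_vertical_stripe = true  },
		{ .cached = false, .failed = false, .in_vertical_stripe = false },
	};

	for (int m = RMW; m <= SCRUB; m++)
		for (int i = 0; i < 3; i++)
			if (!skip_sector(m, &stripe[i]))
				printf("mode %d: read sector %d\n", m, i);
	return 0;
}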
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ffedbfde95e0..fcac70ff5a78 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2707,21 +2707,15 @@ static void raid56_parity_scrub_end_io_work(struct work_struct *work) validate_rbio_for_parity_scrub(rbio); } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_essential_pages(rbio); - if (ret) - goto cleanup; + ASSERT(bio_list_size(bio_list) == 0); - atomic_set(&rbio->error, 0); /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2750,11 +2744,35 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; +} + +static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->error, 0); + ret = scrub_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; bios_to_read = bio_list_size(&bio_list); if (!bios_to_read) { -- cgit From 6bfd0133bee27737db415c530617cb015274d21f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:11 +0800 Subject: btrfs: raid56: switch scrub path to use a single function This switch involves the following changes: - Make finish_parity_scrub() only submit the write bios It will no longer call rbio_orig_end_io(), and now it will return an error. - Add a new helper, recover_scrub_rbio(), to handle recovery It just does extra scrub-related checks and then calls recover_sectors(). - Rename raid56_parity_scrub_stripe() to scrub_rbio() - Rename scrub_parity_work() to scrub_rbio_work_locked(), to follow the existing naming scheme.
- Delete unused functions Including: * finish_rmw() * raid_write_end_io() * raid56_bio_end_io() * __raid_recover_end_io() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 397 +++++++++++------------------------------------------- 1 file changed, 79 insertions(+), 318 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcac70ff5a78..629237102189 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -64,7 +64,6 @@ struct sector_ptr { unsigned int uptodate:8; }; -static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); @@ -72,9 +71,8 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check); -static void scrub_parity_work(struct work_struct *work); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { @@ -819,7 +817,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); - start_async_work(next, scrub_parity_work); + start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; @@ -880,35 +878,6 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) rbio_endio_bio_list(extra, err); } -/* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - int max_errors; - - if (err) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = BLK_STS_OK; - - /* OK, we have read all the stripes we need to. */ - max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bioc->max_errors; - if (atomic_read(&rbio->error) > max_errors) - err = BLK_STS_IOERR; - - rbio_orig_end_io(rbio, err); -} - /* * Get a sector pointer specified by its @stripe_nr and @sector_nr. * @@ -1319,87 +1288,6 @@ error: return -EIO; } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) -{ - /* The total sector number inside the full stripe. */ - /* Sector number inside a stripe. */ - int sectornr; - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); - - /* We should have at least one data sector. */ - ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); - - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. - * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. 
- */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); - - /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. - */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - generate_pq_vertical(rbio, sectornr); - - ret = rmw_assemble_write_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); -} - /* * helper to find the stripe number for a given bio. Used to figure out which * stripe has failed. This expects the bio to correspond to a physical disk, @@ -1568,22 +1456,6 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } } -static void raid56_bio_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - - bio_put(bio); - - if (atomic_dec_and_test(&rbio->stripes_pending)) - queue_work(rbio->bioc->fs_info->endio_raid56_workers, - &rbio->end_io_work); -} - static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1968,60 +1840,6 @@ out: return ret; } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = recover_sectors(rbio); - - /* - * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a - * valid rbio which is consistent with ondisk content, thus such a - * valid rbio can be cached to avoid further disk reads. - */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - /* - * - In case of two failures, where rbio->failb != -1: - * - * Do not cache this rbio since the above read reconstruction - * (raid6_datap_recov() or raid6_2data_recov()) may have - * changed some content of stripes which are not identical to - * on-disk content any more, otherwise, a later write/recover - * may steal stripe_pages from this rbio and end up with - * corruptions or rebuild failures. - * - * - In case of single failure, where rbio->failb == -1: - * - * Cache this rbio iff the above read reconstruction is - * executed without problems. 
- */ - if (!ret && rbio->failb < 0) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } else if (!ret) { - rbio->faila = -1; - rbio->failb = -1; - - if (rbio->operation == BTRFS_RBIO_WRITE) - finish_rmw(rbio); - else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) - finish_parity_scrub(rbio, 0); - else - BUG(); - } else { - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } -} - static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -2449,8 +2267,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2493,7 +2310,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) - goto cleanup; + return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; @@ -2503,7 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; - goto cleanup; + return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; @@ -2590,33 +2407,13 @@ writeback: } submit_write: - nr_data = bio_list_size(&bio_list); - if (!nr_data) { - /* Every parity is right */ - rbio_orig_end_io(rbio, BLK_STS_OK); - return; - } - - atomic_set(&rbio->stripes_pending, nr_data); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_scrub_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; + submit_write_bios(rbio, &bio_list); + return 0; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); + return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2626,85 +2423,51 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) return 0; } -/* - * While we're doing the parity check and repair, we could have errors - * in reading pages off the disk. This checks for errors and if we're - * not able to read the page it'll trigger parity reconstruction. The - * parity scrub will be finished after we've reconstructed the failed - * stripes - */ -static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; - - if (rbio->faila >= 0 || rbio->failb >= 0) { - int dfail = 0, failp = -1; - - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; - - if (is_data_stripe(rbio, rbio->failb)) - dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; - - /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) - */ - if (dfail > rbio->bioc->max_errors - 1) - goto cleanup; + int dfail = 0, failp = -1; + int ret; - /* - * If all data is good, only parity is correctly, just - * repair the parity. 
- */ - if (dfail == 0) { - finish_parity_scrub(rbio, 0); - return; - } + /* No error case should be already handled by the caller. */ + ASSERT(rbio->faila >= 0 || rbio->failb >= 0); - /* - * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. - */ - if (failp != rbio->scrubp) - goto cleanup; + if (is_data_stripe(rbio, rbio->faila)) + dfail++; + else if (is_parity_stripe(rbio->faila)) + failp = rbio->faila; - __raid_recover_end_io(rbio); - } else { - finish_parity_scrub(rbio, 1); - } - return; + if (is_data_stripe(rbio, rbio->failb)) + dfail++; + else if (is_parity_stripe(rbio->failb)) + failp = rbio->failb; -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); -} + /* + * Because we can not use a scrubbing parity to repair + * the data, so the capability of the repair is declined. + * (In the case of RAID5, we can not repair anything) + */ + if (dfail > rbio->bioc->max_errors - 1) + return -EIO; -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_parity_scrub_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); + /* + * If all data is good, only parity is correctly, just + * repair the parity. + */ + if (dfail == 0) + return 0; /* - * This will normally call finish_rmw to start our write, but if there - * are any failed stripes we'll reconstruct from parity first + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity + * is scrubbing parity, luckily, use the other one to repair + * the data, or we can not repair the data stripe. */ - validate_rbio_for_parity_scrub(rbio); + if (failp != rbio->scrubp) + return -EIO; + + /* We have some corrupted sectors, need to repair them. */ + ret = recover_sectors(rbio); + return ret; } static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, @@ -2756,9 +2519,9 @@ error: return ret; } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_rbio(struct btrfs_raid_bio *rbio) { - int bios_to_read = 0; + bool need_check = false; struct bio_list bio_list; int ret; struct bio *bio; @@ -2774,61 +2537,59 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (ret < 0) goto cleanup; - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + ret = -EIO; + goto cleanup; + } /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. 
+ * No error during read, can finish the scrub and need to verify the + * P/Q sectors; */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; + if (atomic_read(&rbio->error) == 0) { + need_check = true; + goto finish; + } - if (trace_raid56_scrub_read_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; + /* We have some failures, need to recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto cleanup; - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read(rbio, bio, &trace_info); - } - submit_bio(bio); - } - /* the actual write will happen once the reads are done */ - return; +finish: + /* + * We have every sector properly prepared. Can finish the scrub + * and writeback the good content. + */ + ret = finish_parity_scrub(rbio, need_check); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) + ret = -EIO; + return ret; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); - return; - -finish: - validate_rbio_for_parity_scrub(rbio); + return ret; } -static void scrub_parity_work(struct work_struct *work) +static void scrub_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_parity_scrub_stripe(rbio); + ret = scrub_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - start_async_work(rbio, scrub_parity_work); + start_async_work(rbio, scrub_rbio_work_locked); } /* The following code is used for dev replace of a missing RAID 5/6 device. */ -- cgit From 1a1a285139707f2430b3833103e152f07c66e63e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:12 +0800 Subject: btrfs: remove the unused endio_raid56_workers and btrfs_raid_bio::end_io_work Since we have switched all raid56 workloads to the submit-and-wait method, there is no use for the btrfs_fs_info::endio_raid56_workers workqueue and btrfs_raid_bio::end_io_work. Remove them to save some memory.
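The submit-and-wait scheme that made this workqueue unnecessary can be modeled in userspace with a condition variable standing in for the rbio wait queue. A rough sketch only; pthreads replace the kernel's atomics, wait_event() and wake_up(), and none of this is kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_wait = PTHREAD_COND_INITIALIZER;
static int stripes_pending;

/* Plays the role of the bio end-io callback: drop the pending count
 * and wake the submitter once the last "I/O" finishes. */
static void *fake_endio(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (--stripes_pending == 0)
		pthread_cond_signal(&io_wait);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t io[4];

	stripes_pending = 4;	/* "submit" four I/Os */
	for (int i = 0; i < 4; i++)
		pthread_create(&io[i], NULL, fake_endio, NULL);

	/* The submitter just waits; no separate completion work item. */
	pthread_mutex_lock(&lock);
	while (stripes_pending > 0)
		pthread_cond_wait(&io_wait, &lock);
	pthread_mutex_unlock(&lock);

	for (int i = 0; i < 4; i++)
		pthread_join(io[i], NULL);
	printf("all I/O done, continue in the submitting context\n");
	return 0;
}

Because completion handling now continues in the submitter's own context, there is no longer a hand-off to a dedicated endio workqueue, which is exactly why it could be deleted.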
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +----- fs/btrfs/fs.h | 1 - fs/btrfs/raid56.h | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f5f793af12a0..e5687bdff150 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2097,8 +2097,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); - if (fs_info->endio_raid56_workers) - destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); if (fs_info->compressed_write_workers) @@ -2304,8 +2302,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta", flags, max_active); - fs_info->endio_raid56_workers = - alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, @@ -2327,7 +2323,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers && diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index c7f2a512fba2..a749367e5ae2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -532,7 +532,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 445e833fcfcf..9fae97b7a2a5 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -97,8 +97,6 @@ struct btrfs_raid_bio { wait_queue_head_t io_wait; - struct work_struct end_io_work; - /* Bitmap to record which horizontal stripe has data */ unsigned long dbitmap; -- cgit From 61ce908a3c260fbe37826a6cbd56636abeffcd28 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:41 +0000 Subject: btrfs: send: avoid unnecessary path allocations when finding extent clone When looking for an extent clone, at find_extent_clone(), we start by allocating a path and then check for cases where we can't have clones and exit immediately in those cases. It's a waste of time to allocate the path before those cases, so reorder the logic so that we check for those cases before allocating the path. 
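The reordering follows the usual early-return shape: do every cheap check before paying for an allocation. A small hypothetical C example of the same structure (the function and values are invented, not btrfs code):

#include <stdio.h>
#include <stdlib.h>

static int process(long offset, long size)
{
	/* Cheap rejections first: no allocation is wasted on them. */
	if (offset >= size)
		return 0;

	/* Allocate only once we know the work will actually happen. */
	char *buf = malloc(64);

	if (!buf)
		return -1;
	snprintf(buf, 64, "work on [%ld, %ld)", offset, size);
	puts(buf);
	free(buf);
	return 0;
}

int main(void)
{
	process(10, 5);		/* early return, no malloc */
	process(0, 5);		/* allocates and does the work */
	return 0;
}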
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3befc0d2d866..d1388d5dc951 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1365,40 +1365,35 @@ static int find_extent_clone(struct send_ctx *sctx, int compressed; u32 i; - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; - - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; - if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. * I at least had this in combination with snapshotting while * writing large files. */ - ret = 0; - goto out; + return 0; } - fi = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_file_extent_item); + fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -ENOENT; - goto out; - } - compressed = btrfs_file_extent_compression(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + return -ENOENT; - num_bytes = btrfs_file_extent_num_bytes(eb, fi); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == 0) { - ret = -ENOENT; - goto out; - } + if (disk_byte == 0) + return -ENOENT; + + compressed = btrfs_file_extent_compression(eb, fi); + num_bytes = btrfs_file_extent_num_bytes(eb, fi); logical = disk_byte + btrfs_file_extent_offset(eb, fi); + tmp_path = alloc_path_for_send(); + if (!tmp_path) + return -ENOMEM; + + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + down_read(&fs_info->commit_root_sem); ret = extent_from_logical(fs_info, disk_byte, tmp_path, &found_key, &flags); -- cgit From d3f41317f0fedac6b0a3de68b3dab66b71923dd8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:42 +0000 Subject: btrfs: send: update comment at find_extent_clone() We have this unclear comment at find_extent_clone() about extents starting at a file offset greater than or equal to the i_size of the inode. It's not really informative and it's misleading, since it mentions the author found such extents with snapshots and large files. Such extents are a result of fallocate with FALLOC_FL_KEEP_SIZE and there is no relation to snapshots or large files (all write paths update the i_size before inserting a new file extent item). So update the comment to be precise about it and why we don't bother looking for clone sources in that case. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d1388d5dc951..4ce5c154f6d7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1365,14 +1365,14 @@ static int find_extent_clone(struct send_ctx *sctx, int compressed; u32 i; - if (data_offset >= ino_size) { - /* - * There may be extents that lie behind the file's size. - * I at least had this in combination with snapshotting while - * writing large files. - */ + /* + * With fallocate we can get prealloc extents beyond the inode's i_size, + * so we don't do anything here because clone operations can not clone + * to a range beyond i_size without increasing the i_size of the + * destination inode.
+ */ + if (data_offset >= ino_size) return 0; - } fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(eb, fi); -- cgit From 344174a1a68a55599d3a264db47d96583ff66473 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:43 +0000 Subject: btrfs: send: drop unnecessary backref context field initializations At find_extent_clone() we are initializing to zero the 'found_itself' and 'found' fields of the backref context before we use it, but we have already initialized the structure to zeroes when we declared it on the stack, so it's pointless to initialize those fields and they are unnecessarily increasing the object text size with two "mov" instructions (x86_64). Similarly, make the 'extent_len' initialization clearer by using an if-then-else instead of a double assignment to it in case the extent's end crosses the i_size boundary. Before this change: $ size fs/btrfs/send.o text data bss dec hex filename 68694 4252 16 72962 11d02 fs/btrfs/send.o After this change: $ size fs/btrfs/send.o text data bss dec hex filename 68678 4252 16 72946 11cf2 fs/btrfs/send.o Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ce5c154f6d7..7d289bdc6de1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1432,11 +1432,8 @@ static int find_extent_clone(struct send_ctx *sctx, } backref_ctx.sctx = sctx; - backref_ctx.found = 0; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; - backref_ctx.found_itself = 0; - backref_ctx.extent_len = num_bytes; /* * The last extent of a file may be too large due to page alignment. @@ -1445,6 +1442,8 @@ static int find_extent_clone(struct send_ctx *sctx, */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; + else + backref_ctx.extent_len = num_bytes; /* * Now collect all backrefs. -- cgit From 22a3c0ac8ed0043af209a15928ae4c4855b0a4c4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:44 +0000 Subject: btrfs: send: avoid unnecessary backref lookups when finding clone source At find_extent_clone(), unless we are given an inline extent, a file extent item that represents a hole, or an extent that starts beyond the i_size, we always do backref walking to look for clone sources, unless we have more than SEND_MAX_EXTENT_REFS (64) known references on the extent. However, if we know we only have one reference in the extent item and only one clone source (the send root), then it's pointless to do the backref walking to search for clone sources, as we can't clone from any other root. So skip the backref walking in that case. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT # Create an extent tree that's not too small and none of the # extents is shared. for ((i = 1; i <= 50000; i++)); do xfs_io -f -c "pwrite 0 4K" $MNT/file_$i > /dev/null echo -ne "\r$i files created..."
done echo btrfs subvolume snapshot -r $MNT $MNT/snap start=$(date +%s%N) btrfs send $MNT/snap > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo -e "\nsend took $dur milliseconds" umount $MNT Before this change: send took 5389 milliseconds After this change: send took 4519 milliseconds (-16.1%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7d289bdc6de1..d9e01b66b32b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1354,6 +1354,7 @@ static int find_extent_clone(struct send_ctx *sctx, u64 disk_byte; u64 num_bytes; u64 extent_item_pos; + u64 extent_refs; u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; @@ -1408,14 +1409,22 @@ static int find_extent_clone(struct send_ctx *sctx, ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], struct btrfs_extent_item); + extent_refs = btrfs_extent_refs(tmp_path->nodes[0], ei); /* * Backreference walking (iterate_extent_inodes() below) is currently * too expensive when an extent has a large number of references, both * in time spent and used memory. So for now just fallback to write * operations instead of clone operations when an extent has more than * a certain amount of references. + * + * Also, if we have only one reference and only the send root as a clone + * source - meaning no clone roots were given in the struct + * btrfs_ioctl_send_args passed to the send ioctl - then it's our + * reference and there's no point in doing backref walking which is + * expensive, so exit early. */ - if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { + if ((extent_refs == 1 && sctx->clone_roots_cnt == 1) || + extent_refs > SEND_MAX_EXTENT_REFS) { ret = -ENOENT; goto out; } -- cgit From c7499a64dcf62bf27968b0e9cce353c490b9ada1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:45 +0000 Subject: btrfs: send: optimize clone detection to increase extent sharing Currently send does not make the best decisions when it comes to deciding between multiple clone sources, which results in clone operations for partial extent ranges, which has the following disadvantages: 1) We get less shared extents at the destination; 2) We have to read more data during the send operation and emit more write commands. Besides not being optimal behaviour, it also breaks user expectations and is often reported by users, with a recent example in the Link tag at the bottom of this change log. Part of the reason for this non-optimal behaviour is that the backref walking code does not provide information about the length of the file extent items that were found for each backref, so send is blind about which backref is the best to choose as a cloning source. The other existing reasons are just silliness, namely always preferring the inode with the lowest number when multiple are found for the same root, and, when we can clone from multiple roots, always preferring the send root over any of the other clone roots. This does not make any sense since any inode or root is fine and as good as any other inode/root. Fix this by making backref walking pass information about the number of bytes referenced by each file extent item and then have send's backref callback pick the inode with the highest number of bytes for each root. Finally, select the root from which we can clone the most bytes.
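
For illustration, here is a small self-contained C sketch of that selection policy; struct clone_candidate and pick_clone_source() are made up for this log and only mirror the num_bytes comparison and the early break in the patch:

  #include <stdint.h>
  #include <stdio.h>

  struct clone_candidate {
      uint64_t root_id;
      uint64_t num_bytes;    /* bytes of the extent this backref covers */
  };

  static const struct clone_candidate *
  pick_clone_source(const struct clone_candidate *c, int nr, uint64_t extent_len)
  {
      const struct clone_candidate *best = NULL;

      for (int i = 0; i < nr; i++) {
          if (c[i].num_bytes == 0)
              continue;    /* no usable reference from this root */
          if (!best || c[i].num_bytes > best->num_bytes) {
              best = &c[i];
              /* Covers the full extent: cannot do better, stop. */
              if (best->num_bytes >= extent_len)
                  break;
          }
      }
      return best;
  }

  int main(void)
  {
      struct clone_candidate c[] = {
          { .root_id = 5,   .num_bytes = 1 << 20 },
          { .root_id = 257, .num_bytes = 2 << 20 },
      };
      const struct clone_candidate *best = pick_clone_source(c, 2, 2 << 20);

      printf("clone from root %llu\n", (unsigned long long)best->root_id);
      return 0;
  }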
Example reproducer: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT xfs_io -f -c "pwrite -S 0xab -b 2M 0 2M" $MNT/foo cp --reflink=always $MNT/foo $MNT/bar cp --reflink=always $MNT/foo $MNT/baz sync # Overwrite the second half of file foo. xfs_io -c "pwrite -S 0xcd -b 1M 1M 1M" $MNT/foo sync echo echo "*** fiemap in the original filesystem ***" echo xfs_io -c "fiemap -v" $MNT/foo xfs_io -c "fiemap -v" $MNT/bar xfs_io -c "fiemap -v" $MNT/baz echo btrfs filesystem du $MNT btrfs subvolume snapshot -r $MNT $MNT/snap btrfs send -f /tmp/send_stream $MNT/snap umount $MNT mkfs.btrfs -f $DEV &> /dev/null mount $DEV $MNT btrfs receive -f /tmp/send_stream $MNT echo echo "*** fiemap in the new filesystem ***" echo xfs_io -r -c "fiemap -v" $MNT/snap/foo xfs_io -r -c "fiemap -v" $MNT/snap/bar xfs_io -r -c "fiemap -v" $MNT/snap/baz echo btrfs filesystem du $MNT rm -f /tmp/send_stream rm -f /tmp/snap.fssum umount $MNT Before this change: $ ./test.sh (...) *** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 32768..34815 2048 0x1 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 1.00MiB - /mnt/sdi/snap/bar 2.00MiB 1.00MiB - /mnt/sdi/snap/baz 6.00MiB 2.00MiB - /mnt/sdi/snap 6.00MiB 2.00MiB 2.00MiB /mnt/sdi We end up with two 1M extents that are not shared for files bar and baz. After this change: $ ./test.sh (...) 
*** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 0.00B - /mnt/sdi/snap/bar 2.00MiB 0.00B - /mnt/sdi/snap/baz 6.00MiB 0.00B - /mnt/sdi/snap 6.00MiB 0.00B 3.00MiB /mnt/sdi Now there's a much better sharing, files bar and baz share 1M of the extent of file foo and the second extent of files bar and baz is shared between themselves. This will later be turned into a test case for fstests. Link: https://lore.kernel.org/linux-btrfs/20221008005704.795b44b0@crass-HP-ZBook-15-G2/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 9 +++++---- fs/btrfs/backref.h | 4 ++-- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/send.c | 49 +++++++++++++++++++++++++++++++++---------------- 4 files changed, 42 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04cb608e7cfb..9be11a342de5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -27,6 +27,7 @@ struct extent_inode_elem { u64 inum; u64 offset; + u64 num_bytes; struct extent_inode_elem *next; }; @@ -37,6 +38,7 @@ static int check_extent_in_eb(const struct btrfs_key *key, struct extent_inode_elem **eie, bool ignore_offset) { + const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); u64 offset = 0; struct extent_inode_elem *e; @@ -45,10 +47,8 @@ static int check_extent_in_eb(const struct btrfs_key *key, !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; - u64 data_len; data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); if (extent_item_pos < data_offset || extent_item_pos >= data_offset + data_len) @@ -63,6 +63,7 @@ static int check_extent_in_eb(const struct btrfs_key *key, e->next = *eie; e->inum = key->objectid; e->offset = key->offset + offset; + e->num_bytes = data_len; *eie = e; return 0; @@ -2265,7 +2266,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu", extent_item_objectid, eie->inum, eie->offset, root); - ret = iterate(eie->inum, eie->offset, root, ctx); + ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx); if (ret) { btrfs_debug(fs_info, "stopping iteration for %llu due to ret=%d", @@ -2357,7 +2358,7 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx) { struct btrfs_data_container *inodes = ctx; const size_t c 
= 3 * sizeof(u64); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2dd68f87dca5..8d3598155f3b 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -75,8 +75,8 @@ struct btrfs_backref_share_check_ctx { int prev_extents_cache_slot; }; -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, - void *ctx); +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8c89e63ef2e8..3c22573bfe0d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -809,8 +809,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, - void *warn_ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) { u32 nlink; int ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9e01b66b32b..49759cd9eecb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -76,7 +76,7 @@ struct clone_root { struct btrfs_root *root; u64 ino; u64 offset; - + u64 num_bytes; u64 found_refs; }; @@ -1273,7 +1273,8 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) * Called for every backref that is found for the current extent. * Results are collected in sctx->clone_roots->ino/offset/found_refs */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, + void *ctx_) { struct backref_ctx *bctx = ctx_; struct clone_root *found; @@ -1318,15 +1319,17 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) bctx->found++; found->found_refs++; - if (ino < found->ino) { + + /* + * If the given backref refers to a file extent item with a larger + * number of bytes than what we found before, use the new one so that + * we clone more optimally and end up doing less writes and getting + * less exclusive, non-shared extents at the destination. + */ + if (num_bytes > found->num_bytes) { found->ino = ino; found->offset = offset; - } else if (found->ino == ino) { - /* - * same extent found more then once in the same file. - */ - if (found->offset > offset + bctx->extent_len) - found->offset = offset; + found->num_bytes = num_bytes; } return 0; @@ -1437,6 +1440,7 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root = sctx->clone_roots + i; cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; + cur_clone_root->num_bytes = 0; cur_clone_root->found_refs = 0; } @@ -1506,14 +1510,27 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { - if (sctx->clone_roots[i].found_refs) { - if (!cur_clone_root) - cur_clone_root = sctx->clone_roots + i; - else if (sctx->clone_roots[i].root == sctx->send_root) - /* prefer clones from send_root over others */ - cur_clone_root = sctx->clone_roots + i; - } + struct clone_root *clone_root = &sctx->clone_roots[i]; + if (clone_root->found_refs == 0) + continue; + + /* + * Choose the root from which we can clone more bytes, to + * minimize write operations and therefore have more extent + * sharing at the destination (the same as in the source). 
+ */ + if (!cur_clone_root || + clone_root->num_bytes > cur_clone_root->num_bytes) { + cur_clone_root = clone_root; + + /* + * We found an optimal clone candidate (any inode from + * any root is fine), so we're done. + */ + if (clone_root->num_bytes >= backref_ctx.extent_len) + break; + } } if (cur_clone_root) { -- cgit From 6ce6ba534418132f4c727d5707fe2794c797299c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:46 +0000 Subject: btrfs: use a single argument for extent offset in backref walking functions The interface for find_parent_nodes() has two extent offset related arguments: 1) One u64 pointer argument for the extent offset; 2) One boolean argument to tell if the extent offset should be ignored or not. These are confusing, because the extent offset pointer can be NULL and in some cases callers pass a NULL value as a way to tell the backref walking code to ignore offsets in file extent items (and simply consider all file extent items that point to the target data extent). The boolean argument was added in commit c995ab3cda3f ("btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents"), but it was never really necessary, since it was enough to arrange for a NULL value to be passed as the "extent_item_pos" argument of find_parent_nodes(). The arguments are also passed to functions called by find_parent_nodes() and respective helper functions, which further makes everything more complicated than needed. Then we have several backref walking related functions that end up calling find_parent_nodes(), either directly or through some other function that they call, and for many we have to use an "extent_item_pos" (u64) argument and a boolean "ignore_offset" argument too. This is confusing and not really necessary. So use a single argument to specify the extent offset, as a simple u64 and not as a pointer, but using a special value of (u64)-1, defined as a documented constant, to indicate when the extent offset should be ignored. This is also preparation work for the upcoming patches in the series that add other arguments to find_parent_nodes() and other related functions that use it.
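
As a generic, stand-alone sketch of this cleanup (names invented, not the btrfs API): a single u64 with a documented all-ones sentinel replaces the nullable pointer plus boolean flag:

  #include <stdint.h>
  #include <stdio.h>

  /* Plays the BTRFS_IGNORE_EXTENT_OFFSET role: "no offset filtering". */
  #define IGNORE_OFFSET ((uint64_t)-1)

  /* Old shape: two ways to say "ignore" (NULL pointer or flag set). */
  static int walk_old(const uint64_t *offset, int ignore_offset)
  {
      return (!offset || ignore_offset) ? -1 : (int)*offset;
  }

  /* New shape: one argument, one documented sentinel value. */
  static int walk_new(uint64_t offset)
  {
      return (offset == IGNORE_OFFSET) ? -1 : (int)offset;
  }

  int main(void)
  {
      uint64_t off = 42;

      printf("old: %d %d\n", walk_old(&off, 0), walk_old(NULL, 0));
      printf("new: %d %d\n", walk_new(off), walk_new(IGNORE_OFFSET));
      return 0;
  }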
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 87 +++++++++++++++++++++++++-------------------------- fs/btrfs/backref.h | 12 +++++-- fs/btrfs/relocation.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/send.c | 2 +- 5 files changed, 54 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9be11a342de5..432064ee788e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -35,15 +35,13 @@ static int check_extent_in_eb(const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); u64 offset = 0; struct extent_inode_elem *e; - if (!ignore_offset && - !btrfs_file_extent_compression(eb, fi) && + if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -81,8 +79,7 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) static int find_extent_in_eb(const struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { u64 disk_byte; struct btrfs_key key; @@ -111,7 +108,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb, if (disk_byte != wanted_disk_byte) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); if (ret < 0) return ret; } @@ -450,9 +447,9 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos, - bool ignore_offset) + int level, u64 time_seq, u64 extent_item_pos) { + const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); int ret = 0; int slot; struct extent_buffer *eb; @@ -528,10 +525,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (extent_item_pos) { + if (!ignore_offset) { ret = check_extent_in_eb(&key, eb, fi, - *extent_item_pos, - &eie, ignore_offset); + extent_item_pos, &eie); if (ret < 0) break; } @@ -541,7 +537,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && extent_item_pos) { + if (!ret && !ignore_offset) { while (old->next) old = old->next; old->next = eie; @@ -570,7 +566,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, bool ignore_offset) + u64 extent_item_pos) { struct btrfs_root *root; struct extent_buffer *eb; @@ -664,7 +660,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos, ignore_offset); + time_seq, extent_item_pos); out: btrfs_put_root(root); out_free: @@ -712,8 +708,8 @@ static void free_leaf_list(struct ulist *ulist) static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, - const u64 *extent_item_pos, - struct 
share_check *sc, bool ignore_offset) + u64 extent_item_pos, + struct share_check *sc) { int err; int ret = 0; @@ -756,8 +752,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, goto out; } err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos, - ignore_offset); + ref, parents, extent_item_pos); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -1340,18 +1335,21 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * * Otherwise this returns 0 for success and <0 for an error. * - * If ignore_offset is set to false, only extent refs whose offsets match - * extent_item_pos are returned. If true, every extent ref is returned - * and extent_item_pos is ignored. + * @extent_item_pos is meaningful only if we are dealing with a data extent. + * If its value is not BTRFS_IGNORE_EXTENT_OFFSET, then only collect references + * from file extent items that refer to a section of the data extent that + * contains @extent_item_pos. If its value is BTRFS_IGNORE_EXTENT_OFFSET then + * collect references for every file extent item that points to the data extent. * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, - struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc, bool ignore_offset) + struct ulist *roots, u64 extent_item_pos, + struct share_check *sc) { + const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct btrfs_path *path; @@ -1539,7 +1537,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc, ignore_offset); + extent_item_pos, sc); if (ret) goto out; @@ -1573,8 +1571,7 @@ again: goto out; } if (ref->count && ref->parent) { - if (extent_item_pos && !ref->inode_list && - ref->level == 0) { + if (!ignore_offset && !ref->inode_list && ref->level == 0) { struct extent_buffer *eb; eb = read_tree_block(fs_info, ref->parent, 0, @@ -1592,7 +1589,7 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie, ignore_offset); + extent_item_pos, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); @@ -1611,7 +1608,7 @@ again: (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && extent_item_pos) { + if (!ret && !ignore_offset) { /* * We've recorded that parent, so we must extend * its inode list here. 
@@ -1665,7 +1662,7 @@ out: int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) + u64 extent_item_pos) { int ret; @@ -1674,7 +1671,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL, ignore_offset); + *leafs, NULL, extent_item_pos, NULL); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1698,8 +1695,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, - bool ignore_offset) + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1718,7 +1714,8 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL, ignore_offset); + tmp, *roots, BTRFS_IGNORE_EXTENT_OFFSET, + NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1745,8 +1742,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!trans && !skip_commit_root_sem) down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, false); + ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, time_seq, roots); if (!trans && !skip_commit_root_sem) up_read(&fs_info->commit_root_sem); return ret; @@ -1845,7 +1841,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool cached; ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, - NULL, NULL, &shared, false); + NULL, BTRFS_IGNORE_EXTENT_OFFSET, &shared); if (ret == BACKREF_FOUND_SHARED || ret == BACKREF_FOUND_NOT_SHARED) { /* If shared must return 1, otherwise return 0. 
*/ @@ -2286,8 +2282,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_item_pos, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + iterate_extent_inodes_t *iterate, void *ctx) { int ret; struct btrfs_trans_handle *trans = NULL; @@ -2318,16 +2313,14 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, - &extent_item_pos, ignore_offset); + seq_elem.seq, &refs, extent_item_pos); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots, - ignore_offset); + seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -2395,10 +2388,14 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; - extent_item_pos = logical - found_key.objectid; + if (ignore_offset) + extent_item_pos = BTRFS_IGNORE_EXTENT_OFFSET; + else + extent_item_pos = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - build_ino_list, ctx, ignore_offset); + build_ino_list, ctx); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8d3598155f3b..2eb99f23cc8f 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -12,6 +12,13 @@ #include "disk-io.h" #include "extent_io.h" +/* + * Pass to backref walking functions to tell them to include references from + * all file extent items that point to the target data extent, regardless if + * they refer to the whole extent or just sections of it (bookend extents). 
+ */ +#define BTRFS_IGNORE_EXTENT_OFFSET ((u64)-1) + struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -92,8 +99,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset); + iterate_extent_inodes_t *iterate, void *ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -104,7 +110,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset); + u64 extent_item_pos); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d119986d1599..45690f7b5900 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3417,7 +3417,7 @@ int add_data_references(struct reloc_control *rc, btrfs_release_path(path); ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, NULL, true); + 0, &leaves, BTRFS_IGNORE_EXTENT_OFFSET); if (ret < 0) return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3c22573bfe0d..6c7dc89709fc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -969,7 +969,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, - scrub_print_warning_inode, &swarn, false); + scrub_print_warning_inode, &swarn); } out: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 49759cd9eecb..6bf06939f891 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1467,7 +1467,7 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - &backref_ctx, false); + &backref_ctx); if (ret < 0) goto out; -- cgit From a2c8d27e5ee810b7149b42b88ddf7298e5b8dfe0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:47 +0000 Subject: btrfs: use a structure to pass arguments to backref walking functions The public backref walking functions have quite a lot of arguments that are passed down the call stack to find_parent_nodes(), the core function of the backref walking code. The next patches in the series will need to add even more arguments to these functions that should be passed not only to find_parent_nodes(), but also to other functions used by the latter (directly or even lower in the call stack). So create a structure to hold all these arguments and state used by the main backref walking function, find_parent_nodes(), and use it as the argument for the public backref walking functions iterate_extent_inodes(), btrfs_find_all_leafs() and btrfs_find_all_roots().
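
A stand-alone sketch of the idea (struct walk_ctx and do_walk() are invented for this log): bundle the long argument list into a context object that is zero-initialized at the call site, so future fields do not ripple through every function signature in the chain:

  #include <stdint.h>
  #include <stdio.h>

  struct walk_ctx {
      uint64_t bytenr;             /* extent to walk, set by the caller */
      uint64_t extent_item_pos;    /* optional offset filter */
      int ignore_extent_item_pos;  /* disable the filter entirely */
      /* Results filled in by the walker. */
      unsigned long nr_refs;
  };

  /* One pointer travels down the whole call chain instead of 5+ arguments. */
  static int do_walk(struct walk_ctx *ctx)
  {
      /* Pretend result, just to show the fields being consumed and filled. */
      ctx->nr_refs = ctx->ignore_extent_item_pos ? 2 : 1;
      return 0;
  }

  int main(void)
  {
      struct walk_ctx ctx = { 0 };    /* same zero-init idiom as the patch */

      ctx.bytenr = 13631488;
      ctx.ignore_extent_item_pos = 1;
      if (do_walk(&ctx) == 0)
          printf("extent %llu: %lu references\n",
                 (unsigned long long)ctx.bytenr, ctx.nr_refs);
      return 0;
  }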
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 343 ++++++++++++++++++++++-------------------- fs/btrfs/backref.h | 76 ++++++++-- fs/btrfs/qgroup.c | 38 +++-- fs/btrfs/relocation.c | 19 ++- fs/btrfs/scrub.c | 14 +- fs/btrfs/send.c | 15 +- fs/btrfs/tests/qgroup-tests.c | 50 ++++-- 7 files changed, 328 insertions(+), 227 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 432064ee788e..a1a00a7bdba5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -444,12 +444,12 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) return 0; } -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, +static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, u64 extent_item_pos) + int level) { - const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); int ret = 0; int slot; struct extent_buffer *eb; @@ -484,10 +484,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); } while (!ret && count < ref->count) { @@ -508,10 +508,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ -525,9 +525,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (!ignore_offset) { + if (!ctx->ignore_extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, - extent_item_pos, &eie); + ctx->extent_item_pos, &eie); if (ret < 0) break; } @@ -537,7 +537,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && !ignore_offset) { + if (!ret && !ctx->ignore_extent_item_pos) { while (old->next) old = old->next; old->next = eie; @@ -545,10 +545,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else - ret = btrfs_next_old_item(root, path, time_seq); + ret = btrfs_next_old_item(root, path, ctx->time_seq); } if (ret > 0) @@ -562,11 +562,10 @@ next: * resolve an indirect backref in the form (root_id, key, level) * to a logical address */ -static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - struct prelim_ref *ref, struct ulist *parents, - u64 extent_item_pos) + struct prelim_ref *ref, struct ulist *parents) { struct btrfs_root *root; struct extent_buffer *eb; 
@@ -584,9 +583,9 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, * here. */ if (path->search_commit_root) - root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id); else - root = btrfs_get_fs_root(fs_info, ref->root_id, false); + root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; @@ -598,17 +597,17 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_is_testing(fs_info)) { + if (btrfs_is_testing(ctx->fs_info)) { ret = -ENOENT; goto out; } if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == BTRFS_SEQ_LAST) + else if (ctx->time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else - root_level = btrfs_old_root_level(root, time_seq); + root_level = btrfs_old_root_level(root, ctx->time_seq); if (root_level + 1 == level) goto out; @@ -636,12 +635,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &search_key, path, time_seq); + ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); - btrfs_debug(fs_info, + btrfs_debug(ctx->fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", ref->root_id, level, ref->count, ret, ref->key_for_search.objectid, ref->key_for_search.type, @@ -659,8 +658,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos); + ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level); out: btrfs_put_root(root); out_free: @@ -705,10 +703,9 @@ static void free_leaf_list(struct ulist *ulist) * rbtree as they are encountered. The new backrefs are subsequently * resolved as above. */ -static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - u64 extent_item_pos, struct share_check *sc) { int err; @@ -751,14 +748,13 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos); + err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ if (err == -ENOENT) { - prelim_ref_insert(fs_info, &preftrees->direct, ref, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; } else if (err) { @@ -787,7 +783,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); - prelim_ref_insert(fs_info, &preftrees->direct, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, new_ref, NULL); } @@ -795,7 +791,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. 
*/ - prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); ulist_reinit(parents); cond_resched(); @@ -1325,32 +1321,18 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * - * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and - * behave much like trans == NULL case, the difference only lies in it will not - * commit root. - * The special case is for qgroup to search roots in commit_transaction(). - * - * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a - * shared extent is detected. + * @ctx: Backref walking context object, must be not NULL. + * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a + * shared extent is detected. * * Otherwise this returns 0 for success and <0 for an error. * - * @extent_item_pos is meaningful only if we are dealing with a data extent. - * If its value is not BTRFS_IGNORE_EXTENT_OFFSET, then only collect references - * from file extent items that refer to a section of the data extent that - * contains @extent_item_pos. If its value is BTRFS_IGNORE_EXTENT_OFFSET then - * collect references for every file extent item that points to the data extent. - * * FIXME some caching might speed things up */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist *refs, - struct ulist *roots, u64 extent_item_pos, +static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, struct share_check *sc) { - const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); - struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1368,11 +1350,11 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, /* Roots ulist is not needed when using a sharedness check context. */ if (sc) - ASSERT(roots == NULL); + ASSERT(ctx->roots == NULL); - key.objectid = bytenr; + key.objectid = ctx->bytenr; key.offset = (u64)-1; - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1380,12 +1362,12 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) { + if (!ctx->trans) { path->search_commit_root = 1; path->skip_locking = 1; } - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; again: @@ -1401,17 +1383,17 @@ again: goto out; } - if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != BTRFS_SEQ_LAST) { + if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) && + ctx->time_seq != BTRFS_SEQ_LAST) { /* * We have a specific time_seq we care about and trans which * means we have the path lock, we need to grab the ref head and * lock it so we have a consistent view of the refs at the given * time. 
*/ - delayed_refs = &trans->transaction->delayed_refs; + delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -1429,7 +1411,7 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = add_delayed_refs(fs_info, head, time_seq, + ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq, &preftrees, sc); mutex_unlock(&head->mutex); if (ret) @@ -1447,14 +1429,14 @@ again: leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && + if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(fs_info, path, bytenr, + ret = add_inline_refs(ctx->fs_info, path, ctx->bytenr, &info_level, &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, bytenr, info_level, + ret = add_keyed_refs(root, path, ctx->bytenr, info_level, &preftrees, sc); if (ret) goto out; @@ -1483,7 +1465,7 @@ again: * extent item pointing to the data extent) is shared, that is, if any * of the extent buffers in the path is referenced by other trees. */ - if (sc && bytenr == sc->data_bytenr) { + if (sc && ctx->bytenr == sc->data_bytenr) { /* * If our data extent is from a generation more recent than the * last generation used to snapshot the root, then we know that @@ -1530,14 +1512,13 @@ again: btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); + ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); - ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc); + ret = resolve_indirect_refs(ctx, path, &preftrees, sc); if (ret) goto out; @@ -1564,17 +1545,18 @@ again: * e.g. different offsets would not be merged, * and would retain their original ref->count < 0. */ - if (roots && ref->count && ref->root_id && ref->parent == 0) { + if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ - ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) goto out; } if (ref->count && ref->parent) { - if (!ignore_offset && !ref->inode_list && ref->level == 0) { + if (!ctx->ignore_extent_item_pos && !ref->inode_list && + ref->level == 0) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref->parent, 0, + eb = read_tree_block(ctx->fs_info, ref->parent, 0, 0, ref->level, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); @@ -1588,8 +1570,8 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); - ret = find_extent_in_eb(eb, bytenr, - extent_item_pos, &eie); + ret = find_extent_in_eb(eb, ctx->bytenr, + ctx->extent_item_pos, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); @@ -1603,12 +1585,12 @@ again: */ eie = NULL; } - ret = ulist_add_merge_ptr(refs, ref->parent, + ret = ulist_add_merge_ptr(ctx->refs, ref->parent, ref->inode_list, (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && !ignore_offset) { + if (!ret && !ctx->ignore_extent_item_pos) { /* * We've recorded that parent, so we must extend * its inode list here. 
@@ -1652,28 +1634,29 @@ out: } /* - * Finds all leafs with a reference to the specified combination of bytenr and - * offset. key_list_head will point to a list of corresponding keys (caller must - * free each list element). The leafs will be stored in the leafs ulist, which - * must be freed with ulist_free. + * Finds all leaves with a reference to the specified combination of + * @ctx->bytenr and @ctx->extent_item_pos. The bytenrs of the found leaves are + * added to the ulist at @ctx->refs, and that ulist is allocated by this + * function. The caller should free the ulist with free_leaf_list() if + * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is + * enough. * - * returns 0 on success, <0 on error + * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. */ -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - u64 extent_item_pos) +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) { int ret; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL); + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - free_leaf_list(*leafs); + free_leaf_list(ctx->refs); + ctx->refs = NULL; return ret; } @@ -1681,7 +1664,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, } /* - * walk all backrefs for a given extent to find all roots that reference this + * Walk all backrefs for a given extent to find all roots that reference this * extent. Walking a backref means finding all extents that reference this * extent and in turn walk the backrefs of those, too. Naturally this is a * recursive process, but here it is implemented in an iterative fashion: We @@ -1689,62 +1672,74 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * list. In turn, we find all referencing extents for those, further appending * to the list. The way we iterate the list allows adding more elements after * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. + * list. + * + * Found roots are added to @ctx->roots, which is allocated by this function and + * @ctx->roots should be NULL when calling this function. This function also + * requires @ctx->refs to be NULL, as it uses it for allocating a ulist to do + * temporary work, and frees it before returning. * - * returns 0 on success, < 0 on error. + * Returns 0 on success, < 0 on error. On error @ctx->roots is always NULL.
*/ -static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { - struct ulist *tmp; - struct ulist_node *node = NULL; + const u64 orig_bytenr = ctx->bytenr; + const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; struct ulist_iterator uiter; - int ret; + int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) + ASSERT(ctx->refs == NULL); + ASSERT(ctx->roots == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); + + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; return -ENOMEM; } + ctx->ignore_extent_item_pos = true; + ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, BTRFS_IGNORE_EXTENT_OFFSET, - NULL); + struct ulist_node *node; + + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - *roots = NULL; - return ret; + ulist_free(ctx->roots); + ctx->roots = NULL; + break; } - node = ulist_next(tmp, &uiter); + ret = 0; + node = ulist_next(ctx->refs, &uiter); if (!node) break; - bytenr = node->val; + ctx->bytenr = node->val; cond_resched(); } - ulist_free(tmp); - return 0; + ulist_free(ctx->refs); + ctx->refs = NULL; + ctx->bytenr = orig_bytenr; + ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos; + + return ret; } -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem) { int ret; - if (!trans && !skip_commit_root_sem) - down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, time_seq, roots); - if (!trans && !skip_commit_root_sem) - up_read(&fs_info->commit_root_sem); + if (!ctx->trans && !skip_commit_root_sem) + down_read(&ctx->fs_info->commit_root_sem); + ret = btrfs_find_all_roots_safe(ctx); + if (!ctx->trans && !skip_commit_root_sem) + up_read(&ctx->fs_info->commit_root_sem); return ret; } @@ -1794,6 +1789,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct btrfs_backref_share_check_ctx *ctx) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -1830,8 +1826,14 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, down_read(&fs_info->commit_root_sem); } else { btrfs_get_tree_mod_seq(fs_info, &elem); + walk_ctx.time_seq = elem.seq; } + walk_ctx.ignore_extent_item_pos = true; + walk_ctx.trans = trans; + walk_ctx.fs_info = fs_info; + walk_ctx.refs = &ctx->refs; + /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); @@ -1840,8 +1842,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool is_shared; bool cached; - ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, - NULL, BTRFS_IGNORE_EXTENT_OFFSET, &shared); + walk_ctx.bytenr = bytenr; + ret = find_parent_nodes(&walk_ctx, &shared); if (ret == BACKREF_FOUND_SHARED || ret == BACKREF_FOUND_NOT_SHARED) { /* If shared must return 1, otherwise return 0. 
*/ @@ -2279,73 +2281,81 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, * the given parameters. * when the iterator function returns a non-zero value, iteration stops. */ -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, u64 extent_item_pos, - int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx) { int ret; - struct btrfs_trans_handle *trans = NULL; - struct ulist *refs = NULL; - struct ulist *roots = NULL; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; + struct ulist *refs; + struct ulist_node *ref_node; struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; - struct ulist_iterator root_uiter; - btrfs_debug(fs_info, "resolving all inodes for extent %llu", - extent_item_objectid); + btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu", + ctx->bytenr); + + ASSERT(ctx->trans == NULL); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->tree_root); + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); trans = NULL; } + ctx->trans = trans; } - if (trans) - btrfs_get_tree_mod_seq(fs_info, &seq_elem); - else - down_read(&fs_info->commit_root_sem); + if (ctx->trans) { + btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem); + ctx->time_seq = seq_elem.seq; + } else { + down_read(&ctx->fs_info->commit_root_sem); + } - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, extent_item_pos); + ret = btrfs_find_all_leafs(ctx); if (ret) goto out; + refs = ctx->refs; + ctx->refs = NULL; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots); + struct ulist_node *root_node; + struct ulist_iterator root_uiter; + + ctx->bytenr = ref_node->val; + ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + ULIST_ITER_INIT(&root_uiter); - while (!ret && (root_node = ulist_next(roots, &root_uiter))) { - btrfs_debug(fs_info, + while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { + btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(fs_info, + ret = iterate_leaf_refs(ctx->fs_info, (struct extent_inode_elem *) (uintptr_t)ref_node->aux, - root_node->val, - extent_item_objectid, - iterate, ctx); + root_node->val, ctx->bytenr, + iterate, user_ctx); } - ulist_free(roots); + ulist_free(ctx->roots); + ctx->roots = NULL; } free_leaf_list(refs); out: - if (trans) { - btrfs_put_tree_mod_seq(fs_info, &seq_elem); - btrfs_end_transaction(trans); + if (ctx->trans) { + btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem); + btrfs_end_transaction(ctx->trans); + ctx->trans = NULL; } else { - up_read(&fs_info->commit_root_sem); + up_read(&ctx->fs_info->commit_root_sem); } return ret; @@ -2375,8 +2385,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, bool ignore_offset) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; - u64 extent_item_pos; u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; @@ -2388,16 
+2398,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; + walk_ctx.bytenr = found_key.objectid; if (ignore_offset) - extent_item_pos = BTRFS_IGNORE_EXTENT_OFFSET; + walk_ctx.ignore_extent_item_pos = true; else - extent_item_pos = logical - found_key.objectid; + walk_ctx.extent_item_pos = logical - found_key.objectid; + walk_ctx.fs_info = fs_info; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, search_commit_root, - build_ino_list, ctx); - - return ret; + return iterate_extent_inodes(&walk_ctx, search_commit_root, + build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2eb99f23cc8f..ce700dfcc016 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -13,11 +13,63 @@ #include "extent_io.h" /* - * Pass to backref walking functions to tell them to include references from - * all file extent items that point to the target data extent, regardless if - * they refer to the whole extent or just sections of it (bookend extents). + * Context and arguments for backref walking functions. Some of the fields are + * to be filled by the caller of such functions while other are filled by the + * functions themselves, as described below. */ -#define BTRFS_IGNORE_EXTENT_OFFSET ((u64)-1) +struct btrfs_backref_walk_ctx { + /* + * The address of the extent for which we are doing backref walking. + * Can be either a data extent or a metadata extent. + * + * Must always be set by the top level caller. + */ + u64 bytenr; + /* + * Offset relative to the target extent. This is only used for data + * extents, and it's meaningful because we can have file extent items + * that point only to a section of a data extent ("bookend" extents), + * and we want to filter out any that don't point to a section of the + * data extent containing the given offset. + * + * Must always be set by the top level caller. + */ + u64 extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then references from + * all file extent items that point to the data extent are considered, + * @extent_item_pos is ignored. + */ + bool ignore_extent_item_pos; + /* A valid transaction handle or NULL. */ + struct btrfs_trans_handle *trans; + /* + * The file system's info object, can not be NULL. + * + * Must always be set by the top level caller. + */ + struct btrfs_fs_info *fs_info; + /* + * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the + * caller joined the tree mod log to get a consistent view of b+trees + * while we do backref walking, or BTRFS_SEQ_LAST. + * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses + * commit roots when searching b+trees - this is a special case for + * qgroups used during a transaction commit. + */ + u64 time_seq; + /* + * Used to collect the bytenr of metadata extents that point to the + * target extent. + */ + struct ulist *refs; + /* + * List used to collect the IDs of the roots from which the target + * extent is accessible. Can be NULL in case the caller does not care + * about collecting root IDs. 
+ */ + struct ulist *roots; +}; struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -96,10 +148,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_key *key, struct btrfs_extent_item *ei, u32 item_size, u64 *out_root, u8 *out_level); -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, - u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -107,13 +158,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - u64 extent_item_pos); -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx); +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 75dd964ca46c..24c013c61a94 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1794,8 +1794,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { - struct ulist *old_root; - u64 bytenr = qrecord->bytenr; + struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; /* @@ -1822,8 +1821,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, - true); + ctx.bytenr = qrecord->bytenr; + ctx.fs_info = trans->fs_info; + + ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, @@ -1839,7 +1840,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * * So modifying qrecord->old_roots is safe here */ - qrecord->old_roots = old_root; + qrecord->old_roots = ctx.roots; return 0; } @@ -2750,17 +2751,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + + ctx.bytenr = record->bytenr; + ctx.fs_info = fs_info; + /* * Old roots should be searched when inserting qgroup * extent record */ if (WARN_ON(!record->old_roots)) { /* Search commit root to find old_roots */ - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, - &record->old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + record->old_roots = ctx.roots; + ctx.roots = NULL; } /* Free the reserved data space */ @@ -2773,10 +2779,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * which doesn't lock tree or delayed_refs and search * current root. It's safe inside commit_transaction(). 
*/ - ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); + ctx.trans = trans; + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + new_roots = ctx.roots; if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); ulist_del(record->old_roots, qgroup_to_skip, @@ -3242,7 +3249,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; - struct ulist *roots = NULL; u64 num_bytes; bool done; int slot; @@ -3292,6 +3298,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -3301,13 +3309,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, else num_bytes = found.offset; - ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots, false); + ctx.bytenr = found.objectid; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ ret = btrfs_qgroup_account_extent(trans, found.objectid, - num_bytes, NULL, roots); + num_bytes, NULL, ctx.roots); if (ret < 0) goto out; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 45690f7b5900..2ecca24e1001 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3408,24 +3408,27 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct ulist *leaves = NULL; + struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; struct ulist_node *ref_node = NULL; - const u32 blocksize = fs_info->nodesize; + const u32 blocksize = rc->extent_root->fs_info->nodesize; int ret = 0; btrfs_release_path(path); - ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, BTRFS_IGNORE_EXTENT_OFFSET); + + ctx.bytenr = extent_key->objectid; + ctx.ignore_extent_item_pos = true; + ctx.fs_info = rc->extent_root->fs_info; + + ret = btrfs_find_all_leafs(&ctx); if (ret < 0) return ret; ULIST_ITER_INIT(&leaf_uiter); - while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, 0, 0, 0, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; @@ -3441,7 +3444,7 @@ int add_data_references(struct reloc_control *rc, } if (ret < 0) free_block_list(blocks); - ulist_free(leaves); + ulist_free(ctx.refs); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6c7dc89709fc..288a97992aa4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -909,7 +909,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) struct btrfs_extent_item *ei; struct scrub_warning swarn; unsigned long ptr = 0; - u64 extent_item_pos; u64 flags = 0; u64 ref_root; u32 item_size; @@ -941,7 +940,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (ret < 0) goto out; - extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; @@ -964,12 +962,18 @@ static void 
scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); btrfs_release_path(path); } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_release_path(path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = swarn.logical - found_key.objectid; + ctx.fs_info = fs_info; + swarn.path = path; swarn.dev = dev; - iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, - scrub_print_warning_inode, &swarn); + + iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } out: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6bf06939f891..0c9a9933341e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1356,12 +1356,12 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_item_pos; u64 extent_refs; u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx backref_ctx = {0}; + struct backref_ctx backref_ctx = { 0 }; + struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; @@ -1461,14 +1461,13 @@ static int find_extent_clone(struct send_ctx *sctx, /* * Now collect all backrefs. */ + backref_walk_ctx.bytenr = found_key.objectid; if (compressed == BTRFS_COMPRESS_NONE) - extent_item_pos = logical - found_key.objectid; - else - extent_item_pos = 0; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, __iterate_backrefs, - &backref_ctx); + backref_walk_ctx.extent_item_pos = logical - found_key.objectid; + backref_walk_ctx.fs_info = fs_info; + ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, + &backref_ctx); if (ret < 0) goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 65b65d55d1f6..3fc8dc3fd980 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -205,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, static int test_no_shared_qgroup(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -220,16 +221,22 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + /* * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. 
*/ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -238,12 +245,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -262,11 +271,13 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_item(root, nodesize, nodesize); if (ret) { @@ -274,12 +285,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -304,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, static int test_multiple_refs(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -324,11 +338,17 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -337,12 +357,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -357,11 +379,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -370,12 +394,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); 
return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -396,11 +422,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -409,12 +437,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); -- cgit From 1baea6f18abf34037169d3b0c585356abb395376 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:48 +0000 Subject: btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() At iterate_extent_inodes() we collect a ulist of leaves for a given extent with a call to btrfs_find_all_leafs() and then we enter a loop where we iterate over all the collected leaves. Each iteration of that loop does a call to btrfs_find_all_roots_safe(), to determine all roots from which a leaf is accessible, and that results in allocating and releasing a ulist to store the root IDs. Instead of allocating and releasing the roots ulist on every iteration, allocate a ulist before entering the loop and keep using it on each iteration, reinitializing the ulist at the end of each iteration. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a1a00a7bdba5..dc276ce3afe0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1674,32 +1674,36 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) * the current while iterating. The process stops when we reach the end of the * list. * - * Found roots are added to @ctx->roots, which is allocated by this function and - * @ctx->roots should be NULL when calling this function. This function also - * requires @ctx->refs to be NULL, as it uses it for allocating a ulist to do - * temporary work, and frees it before returning. + * Found roots are added to @ctx->roots, which is allocated by this function if + * it points to NULL, in which case the caller is responsible for freeing it + * after it's not needed anymore. + * This function requires @ctx->refs to be NULL, as it uses it for allocating a + * ulist to do temporary work, and frees it before returning. * - * Returns 0 on success, < 0 on error. On error @ctx->roots is always NULL. + * Returns 0 on success, < 0 on error. 
*/ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { const u64 orig_bytenr = ctx->bytenr; const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; + bool roots_ulist_allocated = false; struct ulist_iterator uiter; int ret = 0; ASSERT(ctx->refs == NULL); - ASSERT(ctx->roots == NULL); ctx->refs = ulist_alloc(GFP_NOFS); if (!ctx->refs) return -ENOMEM; - ctx->roots = ulist_alloc(GFP_NOFS); if (!ctx->roots) { - ulist_free(ctx->refs); - ctx->refs = NULL; - return -ENOMEM; + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; + return -ENOMEM; + } + roots_ulist_allocated = true; } ctx->ignore_extent_item_pos = true; @@ -1710,8 +1714,10 @@ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(ctx->roots); - ctx->roots = NULL; + if (roots_ulist_allocated) { + ulist_free(ctx->roots); + ctx->roots = NULL; + } break; } ret = 0; @@ -2295,6 +2301,11 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ctx->bytenr); ASSERT(ctx->trans == NULL); + ASSERT(ctx->roots == NULL); + + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) + return -ENOMEM; if (!search_commit_root) { struct btrfs_trans_handle *trans; @@ -2302,8 +2313,11 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && - PTR_ERR(trans) != -EROFS) + PTR_ERR(trans) != -EROFS) { + ulist_free(ctx->roots); + ctx->roots = NULL; return PTR_ERR(trans); + } trans = NULL; } ctx->trans = trans; @@ -2344,8 +2358,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, root_node->val, ctx->bytenr, iterate, user_ctx); } - ulist_free(ctx->roots); - ctx->roots = NULL; + ulist_reinit(ctx->roots); } free_leaf_list(refs); @@ -2358,6 +2371,9 @@ out: up_read(&ctx->fs_info->commit_root_sem); } + ulist_free(ctx->roots); + ctx->roots = NULL; + return ret; } -- cgit From fa104a879073f3974d673ad8c609d986042cf666 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:49 +0000 Subject: btrfs: constify ulist parameter of ulist_next() The ulist_next() iterator function does not need to change the given ulist so make it const. This will allow the next patch in the series to pass a ulist to a function that does not need, and should not, modify the ulist. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ulist.c | 2 +- fs/btrfs/ulist.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 13af1e41f9d7..33606025513d 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -266,7 +266,7 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. 
*/ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter) { struct ulist_node *node; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 02fda0a2d4ce..b2cef187ea8e 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -66,7 +66,7 @@ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, #endif } -struct ulist_node *ulist_next(struct ulist *ulist, +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter); #define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) -- cgit From 66d04209e5a8d3fc47900de6bd0c319790a52b3e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:50 +0000 Subject: btrfs: send: cache leaf to roots mapping during backref walking During a send operation, when doing backref walking to determine which inodes/offsets/roots we can clone from, the most repetitive and expensive step is to map each leaf that has file extent items pointing to the target data extent to the IDs of the roots from which the leaves are accessible, which happens at iterate_extent_inodes(). That step requires finding every parent node of a leaf, then the parent of each parent, and so on until we reach a root node. So it's a naturally expensive operation, and repetitive because each leaf can have hundreds of file extent items (for a nodesize of 16K, that can be slightly over 200 file extent items). There's also temporal locality, as we process all file extent items from a leaf before moving on to the next leaf. This change caches the mapping of leaves to root IDs, to avoid repeating those computations over and over again. The cache is limited to a maximum of 128 entries, with each entry being a struct with a size of 128 bytes, so the maximum cache size is 16K plus any nodes internally allocated by the maple tree that is used to index pointers to those structs. The cache is invalidated whenever we detect that relocation happened since we started filling the cache, because if relocation happened then extent buffers for leaves and nodes of the trees used by a send operation may have been reallocated. This cache also allows for another important optimization that is introduced in the next patch in the series.
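To make the cache's behavior easier to follow before reading the interleaved diff, here is the lookup path condensed from the send.c hunk further below (a sketch only: the function's real signature and the root ID array handling are simplified here):

	static bool backref_cache_lookup(struct send_ctx *sctx, u64 leaf_bytenr,
					 struct backref_cache_entry **entry_ret)
	{
		struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
		const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
		struct backref_cache_entry *entry;

		if (sctx->backref_cache.size == 0)
			return false;

		/*
		 * Relocation may have reallocated the extent buffers backing
		 * the cached leaves and nodes, so drop the whole cache.
		 */
		if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
			empty_backref_cache(sctx);
			return false;
		}

		entry = mtree_load(&sctx->backref_cache.entries, key);
		if (!entry)
			return false;

		/* Keep the LRU list ordered from oldest to most recently used. */
		list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
		*entry_ret = entry;
		return true;
	}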
This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 52 +++++++++++---- fs/btrfs/backref.h | 11 ++++ fs/btrfs/send.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index dc276ce3afe0..9dacf487017d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2303,21 +2303,14 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->trans == NULL); ASSERT(ctx->roots == NULL); - ctx->roots = ulist_alloc(GFP_NOFS); - if (!ctx->roots) - return -ENOMEM; - if (!search_commit_root) { struct btrfs_trans_handle *trans; trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && - PTR_ERR(trans) != -EROFS) { - ulist_free(ctx->roots); - ctx->roots = NULL; + PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); - } trans = NULL; } ctx->trans = trans; @@ -2338,23 +2331,58 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + const u64 leaf_bytenr = ref_node->val; struct ulist_node *root_node; struct ulist_iterator root_uiter; + struct extent_inode_elem *inode_list; + + inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux; + + if (ctx->cache_lookup) { + const u64 *root_ids; + int root_count; + bool cached; + + cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx, + &root_ids, &root_count); + if (cached) { + for (int i = 0; i < root_count; i++) { + ret = iterate_leaf_refs(ctx->fs_info, + inode_list, + root_ids[i], + leaf_bytenr, + iterate, + user_ctx); + if (ret) + break; + } + continue; + } + } + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ret = -ENOMEM; + break; + } + } - ctx->bytenr = ref_node->val; + ctx->bytenr = leaf_bytenr; ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + if (ctx->cache_store) + ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx); + ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { 
btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(ctx->fs_info, - (struct extent_inode_elem *) - (uintptr_t)ref_node->aux, + ret = iterate_leaf_refs(ctx->fs_info, inode_list, root_node->val, ctx->bytenr, iterate, user_ctx); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ce700dfcc016..5005f579f235 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -69,6 +69,17 @@ struct btrfs_backref_walk_ctx { * about collecting root IDs. */ struct ulist *roots; + /* + * Used by iterate_extent_inodes(). Lookup and store functions for an + * optional cache which maps the logical address (bytenr) of leaves + * to an array of root IDs. + */ + bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, + const u64 **root_ids_ret, int *root_count_ret); + void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, + void *user_ctx); + /* Context object to pass to @cache_lookup and @cache_store. */ + void *user_ctx; }; struct inode_fs_paths { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0c9a9933341e..fa94bd550564 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -83,6 +83,39 @@ struct clone_root { #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +/* + * Limit the root_ids array of struct backref_cache_entry to 12 elements. + * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * The most common case is to have a single root for cloning, which corresponds + * to the send root. Having the user specify more than 11 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of + * cloning roots that lead down to a leaf is more than 12. + */ +#define SEND_MAX_BACKREF_CACHE_ROOTS 12 + +/* + * Max number of entries in the cache. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding + * maple tree's internal nodes, is 16K. + */ +#define SEND_MAX_BACKREF_CACHE_SIZE 128 + +/* + * A backref cache entry maps a leaf to a list of IDs of roots from which the + * leaf is accessible and we can use for clone operations. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on + * x86_64). + */ +struct backref_cache_entry { + /* List to link to the cache's lru list. */ + struct list_head list; + /* The key for this entry in the cache. */ + u64 key; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; +}; + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -251,6 +284,14 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; + + struct { + u64 last_reloc_trans; + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. 
*/ + int size; + } backref_cache; }; struct pending_dir_move { @@ -1335,6 +1376,142 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, return 0; } +static void empty_backref_cache(struct send_ctx *sctx) +{ + struct backref_cache_entry *entry; + struct backref_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) + kfree(entry); + + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mtree_destroy(&sctx->backref_cache.entries); + sctx->backref_cache.size = 0; +} + +static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) +{ + struct send_ctx *sctx = ctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct backref_cache_entry *entry; + + if (sctx->backref_cache.size == 0) + return false; + + /* + * If relocation happened since we first filled the cache, then we must + * empty the cache and can not use it, because even though we operate on + * read-only roots, their leaves and nodes may have been reallocated and + * now be used for different nodes/leaves of the same tree or some other + * tree. + * + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { + empty_backref_cache(sctx); + return false; + } + + entry = mtree_load(&sctx->backref_cache.entries, key); + if (!entry) + return false; + + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; + list_move_tail(&entry->list, &sctx->backref_cache.lru_list); + + return true; +} + +static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + void *ctx) +{ + struct send_ctx *sctx = ctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct backref_cache_entry *new_entry; + struct ulist_iterator uiter; + struct ulist_node *node; + int ret; + + /* + * We're called while holding a transaction handle or while holding + * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a + * NOFS allocation. + */ + new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); + /* No worries, cache is optional. */ + if (!new_entry) + return; + + new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { + const u64 root_id = node->val; + struct clone_root *root; + + root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, + sctx->clone_roots_cnt, sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!root) + continue; + + /* Too many roots, just exit, no worries as caching is optional. */ + if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { + kfree(new_entry); + return; + } + + new_entry->root_ids[new_entry->num_roots] = root_id; + new_entry->num_roots++; + } + + /* + * We may have not added any roots to the new cache entry, which means + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. 
+ */ + + if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { + struct backref_cache_entry *lru_entry; + struct backref_cache_entry *mt_entry; + + lru_entry = list_first_entry(&sctx->backref_cache.lru_list, + struct backref_cache_entry, list); + mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); + ASSERT(mt_entry == lru_entry); + list_del(&mt_entry->list); + kfree(mt_entry); + sctx->backref_cache.size--; + } + + ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, + new_entry, GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. */ + kfree(new_entry); + return; + } + + list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); + + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (sctx->backref_cache.size == 0) + sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; + + sctx->backref_cache.size++; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1465,6 +1642,9 @@ static int find_extent_clone(struct send_ctx *sctx, if (compressed == BTRFS_COMPRESS_NONE) backref_walk_ctx.extent_item_pos = logical - found_key.objectid; backref_walk_ctx.fs_info = fs_info; + backref_walk_ctx.cache_lookup = lookup_backref_cache; + backref_walk_ctx.cache_store = store_backref_cache; + backref_walk_ctx.user_ctx = sctx; ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, &backref_ctx); @@ -7891,6 +8071,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mt_init(&sctx->backref_cache.entries); + sctx->flags = arg->flags; if (arg->flags & BTRFS_SEND_FLAG_VERSION) { @@ -8153,6 +8336,8 @@ out: close_current_inode(sctx); + empty_backref_cache(sctx); + kfree(sctx); } -- cgit From 88ffb665c894b1929b30b09e05506ff359d9fb89 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:51 +0000 Subject: btrfs: send: skip unnecessary backref iterations When looking for a clone source for an extent, we are iterating over all the backreferences for an extent. This is often a waste of time, because once we find a good clone source we could stop immediately instead of continuing backref walking, which is expensive. Basically what happens currently is this: 1) Call iterate_extent_inodes() to iterate over all the backreferences; 2) It calls btrfs_find_all_leafs() which in turn calls the main function to walk over backrefs and collect them - find_parent_nodes(); 3) Then we collect all the references for our target data extent from the extent tree (and delayed refs if any), add them to the rb trees, resolve all the indirect backreferences and search for all the file extent items in fs trees, building a list of inodes for each one of them (struct extent_inode_elem); 4) Then back at iterate_extent_inodes() we find all the roots associated to each found leaf, and call the callback __iterate_backrefs defined at send.c for each inode in the inode list associated to each leaf. 
Sometimes one of the first backreferences we find in a fs tree is optimal to satisfy the clone operation that send wants to perform, and in that case we could stop immediately and avoid resolving all the remaining indirect backreferences (search fs trees for the respective file extent items, etc). This is possible if, when we find a fs tree leaf with a file extent item, we are able to know all the roots that can lead to the leaf - something we can now do thanks to the previous patch in the series, which adds a cache that maps leaves to a list of roots. So we can now short-circuit backref walking during send, by having the callback we pass to iterate_extent_inodes() be called when we find a file extent item for an indirect backreference, and have it return a special value when it finds a suitable backreference and does not need to look for more backreferences. This change does that; a condensed sketch of the callback contract follows the patch list below. This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17.
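The sketch mentioned above, condensed from the send.c changes in the diff below (illustrative only: the filtering of unusable references and the candidate tracking are elided):

	/* Called for every backref that is found for the current extent. */
	static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
				    void *ctx_)
	{
		struct backref_ctx *bctx = ctx_;

		/* (reference filtering and candidate tracking elided) */

		/*
		 * A reference covering the whole extent is as good as it gets,
		 * so tell the backref walking code to stop iterating.
		 */
		if (num_bytes >= bctx->extent_len)
			return BTRFS_ITERATE_EXTENT_INODES_STOP;

		return 0;
	}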
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 73 ++++++++++++++++++++++++++++++++---------------- fs/btrfs/backref.h | 44 ++++++++++++++++++++++++----- fs/btrfs/send.c | 81 +++++++++++++++++++++++++++--------------------------- 3 files changed, 128 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9dacf487017d..bb59ba68d7ee 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -31,15 +31,18 @@ struct extent_inode_elem { struct extent_inode_elem *next; }; -static int check_extent_in_eb(const struct btrfs_key *key, +static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, - u64 extent_item_pos, struct extent_inode_elem **eie) { const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); - u64 offset = 0; + u64 offset = key->offset; struct extent_inode_elem *e; + const u64 *root_ids; + int root_count; + bool cached; if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && @@ -48,19 +51,38 @@ static int check_extent_in_eb(const struct btrfs_key *key, data_offset = btrfs_file_extent_offset(eb, fi); - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) + if (ctx->extent_item_pos < data_offset || + ctx->extent_item_pos >= data_offset + data_len) return 1; - offset = extent_item_pos - data_offset; + offset += ctx->extent_item_pos - data_offset; + } + + if (!ctx->indirect_ref_iterator || !ctx->cache_lookup) + goto add_inode_elem; + + cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids, + &root_count); + if (!cached) + goto add_inode_elem; + + for (int i = 0; i < root_count; i++) { + int ret; + + ret = ctx->indirect_ref_iterator(key->objectid, offset, + data_len, root_ids[i], + ctx->user_ctx); + if (ret) + return ret; } +add_inode_elem: e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) return -ENOMEM; e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + offset; + e->offset = offset; e->num_bytes = data_len; *eie = e; @@ -77,8 +99,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) } } -static int find_extent_in_eb(const struct extent_buffer *eb, - u64 wanted_disk_byte, u64 extent_item_pos, +static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct extent_buffer *eb, struct extent_inode_elem **eie) { u64 disk_byte; @@ -105,11 +127,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, continue; /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != wanted_disk_byte) + if (disk_byte != ctx->bytenr) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) return ret; } @@ -526,9 +548,9 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, else goto next; if (!ctx->ignore_extent_item_pos) { - ret = check_extent_in_eb(&key, eb, fi, - ctx->extent_item_pos, &eie); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) break; } if (ret > 0) @@ -551,10 +573,11 @@ next: ret = btrfs_next_old_item(root, path, ctx->time_seq); } - if (ret > 0) - ret = 0; - else if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); + else if (ret > 0) 
+ ret = 0; + return ret; } @@ -1570,12 +1593,12 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); - ret = find_extent_in_eb(eb, ctx->bytenr, - ctx->extent_item_pos, &eie); + ret = find_extent_in_eb(ctx, eb, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) goto out; ref->inode_list = eie; /* @@ -1628,7 +1651,7 @@ out: prelim_release(&preftrees.indirect); prelim_release(&preftrees.indirect_missing_keys); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); return ret; } @@ -1654,7 +1677,8 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) return -ENOMEM; ret = find_parent_nodes(ctx, NULL); - if (ret < 0 && ret != -ENOENT) { + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + (ret < 0 && ret != -ENOENT)) { free_leaf_list(ctx->refs); ctx->refs = NULL; return ret; @@ -2402,6 +2426,9 @@ out: ulist_free(ctx->roots); ctx->roots = NULL; + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP) + ret = 0; + return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 5005f579f235..a80d1e8be718 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -12,6 +12,25 @@ #include "disk-io.h" #include "extent_io.h" +/* + * Used by implementations of iterate_extent_inodes_t (see definition below) to + * signal that backref iteration can stop immediately and no error happened. + * The value must be non-negative and must not be 0, 1 (which is a common return + * value from things like btrfs_search_slot() and used internally in the backref + * walking code) and different from BACKREF_FOUND_SHARED and + * BACKREF_FOUND_NOT_SHARED + */ +#define BTRFS_ITERATE_EXTENT_INODES_STOP 5 + +/* + * Should return 0 if no errors happened and iteration of backrefs should + * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero + * value to immediately stop iteration and possibly signal an error back to + * the caller. + */ +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); + /* * Context and arguments for backref walking functions. Some of the fields are * to be filled by the caller of such functions while other are filled by the @@ -70,15 +89,29 @@ struct btrfs_backref_walk_ctx { */ struct ulist *roots; /* - * Used by iterate_extent_inodes(). Lookup and store functions for an - * optional cache which maps the logical address (bytenr) of leaves - * to an array of root IDs. + * Used by iterate_extent_inodes() and the main backref walk code + * (find_parent_nodes()). Lookup and store functions for an optional + * cache which maps the logical address (bytenr) of leaves to an array + * of root IDs. */ bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, const u64 **root_ids_ret, int *root_count_ret); void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, void *user_ctx); - /* Context object to pass to @cache_lookup and @cache_store. */ + /* + * If this is not NULL, then the backref walking code will call this + * for each indirect data extent reference as soon as it finds one, + * before collecting all the remaining backrefs and before resolving + * indirect backrefs. This allows for the caller to terminate backref + * walking as soon as it finds one backref that matches some specific + * criteria. The @cache_lookup and @cache_store callbacks should not + * be NULL in order to use this callback. 
+ */ + iterate_extent_inodes_t *indirect_ref_iterator; + /* + * Context object to pass to the @cache_lookup, @cache_store and + * @indirect_ref_iterator callbacks. + */ void *user_ctx; }; @@ -145,9 +178,6 @@ struct btrfs_backref_share_check_ctx { int prev_extents_cache_slot; }; -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, - u64 root, void *ctx); - struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa94bd550564..f453f6406564 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -77,7 +77,7 @@ struct clone_root { u64 ino; u64 offset; u64 num_bytes; - u64 found_refs; + bool found_ref; }; #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 @@ -1281,9 +1281,6 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - - /* Just to check for bugs in backref resolving */ - int found_itself; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1312,33 +1309,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) /* * Called for every backref that is found for the current extent. - * Results are collected in sctx->clone_roots->ino/offset/found_refs + * Results are collected in sctx->clone_roots->ino/offset. */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, - void *ctx_) +static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + void *ctx_) { struct backref_ctx *bctx = ctx_; - struct clone_root *found; + struct clone_root *clone_root; /* First check if the root is in the list of accepted clone sources */ - found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, - bctx->sctx->clone_roots_cnt, - sizeof(struct clone_root), - __clone_root_cmp_bsearch); - if (!found) + clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!clone_root) return 0; - if (found->root == bctx->sctx->send_root && + /* This is our own reference, bail out as we can't clone from it. */ + if (clone_root->root == bctx->sctx->send_root && ino == bctx->cur_objectid && - offset == bctx->cur_offset) { - bctx->found_itself = 1; - } + offset == bctx->cur_offset) + return 0; /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ - if (found->root == bctx->sctx->send_root) { + if (clone_root->root == bctx->sctx->send_root) { /* * If the source inode was not yet processed we can't issue a * clone operation, as the source extent does not exist yet at @@ -1359,7 +1356,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, } bctx->found++; - found->found_refs++; + clone_root->found_ref = true; /* * If the given backref refers to a file extent item with a larger @@ -1367,10 +1364,17 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, * we clone more optimally and end up doing less writes and getting * less exclusive, non-shared extents at the destination. */ - if (num_bytes > found->num_bytes) { - found->ino = ino; - found->offset = offset; - found->num_bytes = num_bytes; + if (num_bytes > clone_root->num_bytes) { + clone_root->ino = ino; + clone_root->offset = offset; + clone_root->num_bytes = num_bytes; + + /* + * Found a perfect candidate, so there's no need to continue + * backref walking. 
+ */ + if (num_bytes >= bctx->extent_len) + return BTRFS_ITERATE_EXTENT_INODES_STOP; } return 0; @@ -1392,7 +1396,8 @@ static void empty_backref_cache(struct send_ctx *sctx) static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { - struct send_ctx *sctx = ctx; + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; struct backref_cache_entry *entry; @@ -1430,7 +1435,8 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, void *ctx) { - struct send_ctx *sctx = ctx; + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct backref_cache_entry *new_entry; struct ulist_iterator uiter; @@ -1618,7 +1624,7 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; cur_clone_root->num_bytes = 0; - cur_clone_root->found_refs = 0; + cur_clone_root->found_ref = false; } backref_ctx.sctx = sctx; @@ -1628,7 +1634,7 @@ static int find_extent_clone(struct send_ctx *sctx, /* * The last extent of a file may be too large due to page alignment. * We need to adjust extent_len in this case so that the checks in - * __iterate_backrefs work. + * iterate_backrefs() work. */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; @@ -1644,9 +1650,10 @@ static int find_extent_clone(struct send_ctx *sctx, backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_store = store_backref_cache; - backref_walk_ctx.user_ctx = sctx; + backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.user_ctx = &backref_ctx; - ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) goto out; @@ -1671,27 +1678,21 @@ static int find_extent_clone(struct send_ctx *sctx, } up_read(&fs_info->commit_root_sem); - if (!backref_ctx.found_itself) { - /* found a bug in backref code? */ - ret = -EIO; - btrfs_err(fs_info, - "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu", - ino, data_offset, disk_byte, found_key.objectid); - goto out; - } - btrfs_debug(fs_info, "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", data_offset, ino, num_bytes, logical); - if (!backref_ctx.found) + if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); + ret = -ENOENT; + goto out; + } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { struct clone_root *clone_root = &sctx->clone_roots[i]; - if (clone_root->found_refs == 0) + if (!clone_root->found_ref) continue; /* -- cgit From f73853c7168aef0e071c160979af05a148691e61 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:52 +0000 Subject: btrfs: send: avoid double extent tree search when finding clone source At find_extent_clone() we search twice for the extent item corresponding to the data extent that the current file extent item points to: 1) Once with a call to extent_from_logical(); 2) Once again during backref walking, through iterate_extent_inodes() which eventually leads to find_parent_nodes() where we will search again the extent tree for the same extent item.
The extent tree can be huge, so doing this one extra search for every extent we want to send adds up and it's expensive. The first call is there since the send code was introduced and it accomplishes two things: 1) Check that the extent is flagged as a data extent in the extent tree. But it cannot be anything else, otherwise we wouldn't have a file extent item in the send root pointing to it. This was probably added to catch bugs in the early days, when send was still young and the interaction with everything else was far from perfect; 2) Check how many direct references there are on the extent, and if there are too many (more than SEND_MAX_EXTENT_REFS), avoid doing the backref walking as it may take too long and slow down send. So improve on this by having a callback in the backref walking code that is called when it finds the extent item in the extent tree, and have those checks done in the callback (a condensed sketch of it follows the patch list below). When the callback returns anything different from 0, it stops the backref walking code. This way we do a single search on the extent tree for the extent item of our data extent. Also, before this change we were only checking the number of references on the data extent against SEND_MAX_EXTENT_REFS, but after starting backref walking we will end up resolving backrefs for extent buffers in the path from a leaf having a file extent item pointing to our data extent, up to the roots of the trees from which the extent buffer is accessible, due to shared subtrees resulting from snapshotting. We were therefore allowing for the possibility of send taking too long due to some node in the path from the leaf to a root node being shared too many times. After this change we check for reference counts being greater than SEND_MAX_EXTENT_REFS for both data extents and metadata extents. This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17.
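The condensed sketch of the new callback, derived from the send.c hunk below (illustrative only: the early exit for the single-reference, no-extra-clone-roots case is elided):

	static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
				     const struct extent_buffer *leaf, void *ctx)
	{
		const u64 refs = btrfs_extent_refs(leaf, ei);

		/*
		 * Backref walking is too expensive for extents with many
		 * references, both in time and memory, so returning an error
		 * here stops the walk and makes send fall back to write
		 * operations instead of clone operations.
		 */
		if (refs > SEND_MAX_EXTENT_REFS)
			return -ENOENT;

		return 0;
	}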
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 31 +++++++++------- fs/btrfs/backref.h | 9 +++-- fs/btrfs/send.c | 103 ++++++++++++++++++++++++----------------------------- 3 files changed, 73 insertions(+), 70 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bb59ba68d7ee..33056c4c0528 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1003,8 +1003,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_inline_refs(const struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, +static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, int *info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1029,6 +1029,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + + if (ctx->check_extent_item) { + ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx); + if (ret) + return ret; + } + flags = btrfs_extent_flags(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -1064,9 +1071,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = add_direct_ref(fs_info, preftrees, + ret = add_direct_ref(ctx->fs_info, preftrees, *info_level + 1, offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -1075,14 +1082,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(fs_info, preftrees, 0, offset, - bytenr, count, sc, GFP_NOFS); + ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset, + ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = add_indirect_ref(fs_info, preftrees, offset, + ret = add_indirect_ref(ctx->fs_info, preftrees, offset, NULL, *info_level + 1, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -1104,8 +1111,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, + ret = add_indirect_ref(ctx->fs_info, preftrees, root, + &key, 0, ctx->bytenr, count, sc, GFP_NOFS); break; @@ -1455,8 +1462,8 @@ again: if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(ctx->fs_info, path, ctx->bytenr, - &info_level, &preftrees, sc); + ret = add_inline_refs(ctx, path, &info_level, + &preftrees, sc); if (ret) goto out; ret = add_keyed_refs(root, path, ctx->bytenr, info_level, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a80d1e8be718..1bd5a15c7f9e 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -109,9 +109,14 @@ struct btrfs_backref_walk_ctx { */ iterate_extent_inodes_t *indirect_ref_iterator; /* - * Context object to pass to the @cache_lookup, @cache_store and - * @indirect_ref_iterator callbacks. + * If this is not NULL, then the backref walking code will call this for + * each extent item it's meant to process before it actually starts + * processing it. 
If this returns anything other than 0, then it stops + * the backref walking code immediately. */ + int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *user_ctx); + /* Context object to pass to the callbacks defined above. */ void *user_ctx; }; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f453f6406564..516b80637bfb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1281,6 +1281,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + + /* The bytenr the file extent item we are processing refers to. */ + u64 bytenr; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1518,6 +1521,43 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, sctx->backref_cache.size++; } +static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *ctx) +{ + const u64 refs = btrfs_extent_refs(leaf, ei); + const struct backref_ctx *bctx = ctx; + const struct send_ctx *sctx = bctx->sctx; + + if (bytenr == bctx->bytenr) { + const u64 flags = btrfs_extent_flags(leaf, ei); + + if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return -EUCLEAN; + + /* + * If we have only one reference and only the send root as a + * clone source - meaning no clone roots were given in the + * struct btrfs_ioctl_send_args passed to the send ioctl - then + * it's our reference and there's no point in doing backref + * walking which is expensive, so exit early. + */ + if (refs == 1 && sctx->clone_roots_cnt == 1) + return -ENOENT; + } + + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (refs > SEND_MAX_EXTENT_REFS) + return -ENOENT; + + return 0; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. 
The function makes @@ -1539,16 +1579,11 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_refs; - u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; struct backref_ctx backref_ctx = { 0 }; struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; - struct btrfs_key found_key; - struct btrfs_path *tmp_path; - struct btrfs_extent_item *ei; int compressed; u32 i; @@ -1574,48 +1609,6 @@ static int find_extent_clone(struct send_ctx *sctx, num_bytes = btrfs_file_extent_num_bytes(eb, fi); logical = disk_byte + btrfs_file_extent_offset(eb, fi); - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; - - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; - - down_read(&fs_info->commit_root_sem); - ret = extent_from_logical(fs_info, disk_byte, tmp_path, - &found_key, &flags); - up_read(&fs_info->commit_root_sem); - - if (ret < 0) - goto out; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = -EIO; - goto out; - } - - ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], - struct btrfs_extent_item); - extent_refs = btrfs_extent_refs(tmp_path->nodes[0], ei); - /* - * Backreference walking (iterate_extent_inodes() below) is currently - * too expensive when an extent has a large number of references, both - * in time spent and used memory. So for now just fallback to write - * operations instead of clone operations when an extent has more than - * a certain amount of references. - * - * Also, if we have only one reference and only the send root as a clone - * source - meaning no clone roots were given in the struct - * btrfs_ioctl_send_args passed to the send ioctl - then it's our - * reference and there's no point in doing backref walking which is - * expensive, so exit early. - */ - if ((extent_refs == 1 && sctx->clone_roots_cnt == 1) || - extent_refs > SEND_MAX_EXTENT_REFS) { - ret = -ENOENT; - goto out; - } - btrfs_release_path(tmp_path); - /* * Setup the clone roots. */ @@ -1630,6 +1623,7 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx.sctx = sctx; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; + backref_ctx.bytenr = disk_byte; /* * The last extent of a file may be too large due to page alignment. @@ -1644,19 +1638,20 @@ static int find_extent_clone(struct send_ctx *sctx, /* * Now collect all backrefs. */ - backref_walk_ctx.bytenr = found_key.objectid; + backref_walk_ctx.bytenr = disk_byte; if (compressed == BTRFS_COMPRESS_NONE) - backref_walk_ctx.extent_item_pos = logical - found_key.objectid; + backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_store = store_backref_cache; backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.check_extent_item = check_extent_item; backref_walk_ctx.user_ctx = &backref_ctx; ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -1673,8 +1668,7 @@ static int find_extent_clone(struct send_ctx *sctx, * was already reallocated after the relocation. 
*/ up_read(&fs_info->commit_root_sem); - ret = -ENOENT; - goto out; + return -ENOENT; } up_read(&fs_info->commit_root_sem); @@ -1684,8 +1678,7 @@ static int find_extent_clone(struct send_ctx *sctx, if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); - ret = -ENOENT; - goto out; + return -ENOENT; } cur_clone_root = NULL; @@ -1720,8 +1713,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -ENOENT; } -out: - btrfs_free_path(tmp_path); return ret; } -- cgit From adf0241868bd46c7be8012ef99cb88c7f47c16ce Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:53 +0000 Subject: btrfs: send: skip resolution of our own backref when finding clone source When doing backref walking to determine a source range to clone from, it is worthless to collect and resolve our own data backref, as we can't obviously use it as a clone source and it represents the range we want to clone into. Collecting the backref implies doing the extra work to resolve it, doing the search for a file extent item in a subvolume tree, etc. Skipping the data backref is valid as long as we only have the send root as the single clone root, otherwise the leaf with the file extent item may be accessible from another clone root due to shared subtrees created by snapshots, and therefore we have to collect the backref and resolve it. So add a callback to the backref walking code to guide it to skip data backrefs. This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source The following test was run on non-debug kernel (Debian's default kernel config) before and after applying the patchset: $ cat test-send-many-shared-extents.sh #!/bin/bash DEV=/dev/sdh MNT=/mnt/sdh umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT num_files=50000 num_clones_per_file=50 for ((i = 1; i <= $num_files; i++)); do xfs_io -f -c "pwrite 0 64K" $MNT/file_$i > /dev/null echo -ne "\r$i files created..." 
done echo btrfs subvolume snapshot -r $MNT $MNT/snap1 cloned=0 for ((i = 1; i <= $num_clones_per_file; i++)); do for ((j = 1; j <= $num_files; j++)); do cp --reflink=always $MNT/file_$j $MNT/file_${j}_clone_${i} cloned=$((cloned + 1)) echo -ne "\r$cloned / $((num_files * num_clones_per_file)) clone operations" done done echo btrfs subvolume snapshot -r $MNT $MNT/snap2 # Unmount and mount again to clear all cached metadata (and data). umount $DEV mount $DEV $MNT start=$(date +%s%N) btrfs send $MNT/snap2 > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000000 )) echo -e "\nFull send took $dur seconds" # Unmount and mount again to clear all cached metadata (and data). umount $DEV mount $DEV $MNT start=$(date +%s%N) btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000000 )) echo -e "\nIncremental send took $dur seconds" umount $MNT Before applying the patchset: (...) Full send took 1108 seconds (...) Incremental send took 1135 seconds After applying the whole patchset: (...) Full send took 268 seconds (-75.8%) (...) Incremental send took 316 seconds (-72.2%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 35 +++++++++++++++++++++-------------- fs/btrfs/backref.h | 9 +++++++++ fs/btrfs/send.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 33056c4c0528..430974cf3b96 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1111,10 +1111,12 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(ctx->fs_info, preftrees, root, - &key, 0, ctx->bytenr, count, - sc, GFP_NOFS); - + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(ctx->fs_info, preftrees, + root, &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1133,8 +1135,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. 
*/ -static int add_keyed_refs(struct btrfs_root *extent_root, - struct btrfs_path *path, u64 bytenr, +static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *extent_root, + struct btrfs_path *path, int info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1157,7 +1160,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != bytenr) + if (key.objectid != ctx->bytenr) break; if (key.type < BTRFS_TREE_BLOCK_REF_KEY) continue; @@ -1169,7 +1172,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, /* SHARED DIRECT METADATA backref */ ret = add_direct_ref(fs_info, preftrees, info_level + 1, key.offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ @@ -1180,14 +1183,14 @@ static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = add_direct_ref(fs_info, preftrees, 0, - key.offset, bytenr, count, + key.offset, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ ret = add_indirect_ref(fs_info, preftrees, key.offset, - NULL, info_level + 1, bytenr, + NULL, info_level + 1, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { @@ -1211,9 +1214,13 @@ static int add_keyed_refs(struct btrfs_root *extent_root, } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, - sc, GFP_NOFS); + + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1466,7 +1473,7 @@ again: &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, ctx->bytenr, info_level, + ret = add_keyed_refs(ctx, root, path, info_level, &preftrees, sc); if (ret) goto out; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 1bd5a15c7f9e..ef6bbea3f456 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -116,6 +116,15 @@ struct btrfs_backref_walk_ctx { */ int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, const struct extent_buffer *leaf, void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this for + * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before + * processing that data ref. If this callback return false, then it will + * ignore this data ref and it will never resolve the indirect data ref, + * saving time searching for leaves in a fs tree with file extent items + * matching the data ref. + */ + bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx); /* Context object to pass to the callbacks defined above. */ void *user_ctx; }; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 516b80637bfb..383bc8a5cb6c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1284,6 +1284,10 @@ struct backref_ctx { /* The bytenr the file extent item we are processing refers to. */ u64 bytenr; + /* The owner (root id) of the data backref for the current extent. */ + u64 backref_owner; + /* The offset of the data backref for the current extent. 
*/ + u64 backref_offset; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1558,6 +1562,18 @@ static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, return 0; } +static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) +{ + const struct backref_ctx *bctx = ctx; + + if (ino == bctx->cur_objectid && + root == bctx->backref_owner && + offset == bctx->backref_offset) + return true; + + return false; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1624,6 +1640,12 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; backref_ctx.bytenr = disk_byte; + /* + * Use the header owner and not the send root's id, because in case of a + * snapshot we can have shared subtrees. + */ + backref_ctx.backref_owner = btrfs_header_owner(eb); + backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1648,6 +1670,17 @@ static int find_extent_clone(struct send_ctx *sctx, backref_walk_ctx.check_extent_item = check_extent_item; backref_walk_ctx.user_ctx = &backref_ctx; + /* + * If we have a single clone root, then it's the send root and we can tell + * the backref walking code to skip our own backref and not resolve it, + * since we can not use it for cloning - the source and destination + * ranges can't overlap and in case the leaf is shared through a subtree + * due to snapshots, we can't use those other roots since they are not + * in the list of clone roots. + */ + if (sctx->clone_roots_cnt == 1) + backref_walk_ctx.skip_data_ref = skip_self_data_ref; + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) -- cgit From e2a041657774102b184cc41369a53be3d024e8ee Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:54 +0000 Subject: btrfs: send: bump the extent reference count limit for backref walking After the previous patchset which is comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source we now have much better performance when doing backref walking in the send code, so we can increase the current limit
from 64 to 1024 references. This limit is still a bit conservative because there are still edge cases where backref walking will be too slow, spending a lot of CPU time, doing IO to read b+tree nodes/leaves, and consuming a lot of memory. The goal is to eventually get rid of any limit, but for now bump it as it benefits users with extents shared more than 64 times and up to 1024 times, allowing for more deduplication at the destination without having to run a dedupe tool after a receive. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 383bc8a5cb6c..67f7c698ade3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -39,7 +39,7 @@ * avoid hitting limitations of the backreference walking code (taking a lot of * time and using too much memory for extents with large number of references). */ -#define SEND_MAX_EXTENT_REFS 64 +#define SEND_MAX_EXTENT_REFS 1024 /* * A fs_path is a helper to dynamically build path names with unknown size. -- cgit From 9e5e6d4e2e5e37e28bf2be85fa08761e33aa5efb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:40 +0100 Subject: btrfs: zlib: use copy_page for full page copy The copy_page helper may use an optimized version for full page copy (e.g. on s390 there's a special instruction for that); there's one more call site left to convert. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b4f44662cda7..c5275cb23837 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -155,8 +155,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap_local_page(in_page); - memcpy(workspace->buf + i * PAGE_SIZE, - data_in, PAGE_SIZE); + copy_page(workspace->buf + i * PAGE_SIZE, + data_in); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; -- cgit From fd463ac4616e69c7306c680dabfe857a2f50fc69 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:42 +0100 Subject: btrfs: zoned: use helper to check a power of two zone size We have a 64bit compatible helper to check if a value is a power of two, use it instead of open coding it. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 13034c3bbb65..95dec93374ea 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -394,8 +394,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_sectors = bdev_zone_sectors(bdev); } - /* Check if it's power of 2 (see is_power_of_2) */ - ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); + ASSERT(is_power_of_two_u64(zone_sectors)); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; /* We reject devices with a zone size larger than 8GB */ -- cgit From 0d7764ff58b4b45c39eb03f2c74a819c1a88fa7b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:44 +0100 Subject: btrfs: convert btrfs_block_group::needs_free_space to runtime flag We already have flags in the block group to track various status bits; convert needs_free_space as well and reduce the size of btrfs_block_group.
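For reference, the conversion follows the standard bit-flag pattern; sketched with the names introduced in the diff below, the dedicated integer turns into set_bit()/test_bit()/clear_bit() calls on the shared runtime_flags word:

        /* Mark the block group when it is created. */
        set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);

        /* Test before touching the free space tree. */
        if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
                ret = __add_block_group_free_space(trans, block_group, path);

        /* Clear once the block group has been added. */
        clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);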
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/block-group.h | 8 ++------ fs/btrfs/free-space-tree.c | 10 +++++----- fs/btrfs/tests/free-space-tree-tests.c | 2 +- 4 files changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ea5fcb665daa..708d843daa72 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2553,7 +2553,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->global_root_id = calculate_global_root_id(fs_info, cache->start); if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); ret = btrfs_load_block_group_zone_info(cache, true); if (ret) { diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 6c970a486b68..4cee23c11938 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -55,6 +55,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + /* Does the block group need to be added to the free space tree? */ + BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, }; enum btrfs_caching_type { @@ -208,12 +210,6 @@ struct btrfs_block_group { /* Lock for free space tree operations. */ struct mutex free_space_lock; - /* - * Does the block group need to be added to the free space tree? - * Protected by free_space_lock. - */ - int needs_free_space; - /* Flag indicating this block group is placed on a sequential zone */ bool seq_zone; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 869d062d6765..c667e878ef1a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -808,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -1001,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, u32 flags; int ret; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; @@ -1304,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, { int ret; - block_group->needs_free_space = 0; + clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); ret = add_new_free_space_info(trans, block_group, path); if (ret) @@ -1326,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans, return 0; mutex_lock(&block_group->free_space_lock); - if (!block_group->needs_free_space) + if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) goto out; path = btrfs_alloc_path(); @@ -1359,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; - if (block_group->needs_free_space) { + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { /* We never added this block group to the free space tree. 
*/ return 0; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 53a17b1d1744..b61972046feb 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -471,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, } cache->bitmap_low_thresh = 0; cache->bitmap_high_thresh = (u32)-1; - cache->needs_free_space = 1; + set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); cache->fs_info = root->fs_info; btrfs_init_dummy_trans(&trans, root->fs_info); -- cgit From 961f5b8bf48a463ac5fe5b13143426d79eb41817 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 31 Oct 2022 20:33:46 +0100 Subject: btrfs: convert btrfs_block_group::seq_zone to runtime flag In zoned mode the sequential status of a zone can also be tracked in the runtime flags of the block group. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 5 ++--- fs/btrfs/zoned.c | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 4cee23c11938..a02ea76fd6cf 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -57,6 +57,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Indicate that the block group is placed on a sequential zone */ + BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, }; enum btrfs_caching_type { @@ -210,9 +212,6 @@ struct btrfs_block_group { /* Lock for free space tree operations. */ struct mutex free_space_lock; - /* Flag indicating this block group is placed on a sequential zone */ - bool seq_zone; - /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 95dec93374ea..a759668477bb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1437,7 +1437,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (num_sequential > 0) - cache->seq_zone = true; + set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { /* Zone capacity is always zone size in emulation */ @@ -1649,7 +1649,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!cache) return false; - ret = cache->seq_zone; + ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; @@ -2154,7 +2154,8 @@ static void btrfs_zone_finish_endio_workfn(struct work_struct *work) void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { - if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || + eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) return; if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { -- cgit From 19af6a7d345acc885f970d57577fa80ca4ad3d98 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:02:32 +0200 Subject: btrfs: change how repair action is passed to btrfs_repair_one_sector There's a function pointer passed to btrfs_repair_one_sector that will submit the right bio for repair. However there are only two callbacks, for buffered and for direct IO.
This can be simplified to a bool-based switch that calls either function, as the indirect calls are an unnecessary abstraction in this case. This allows removing the submit_bio_hook_t typedef. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +++ fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 14 +++++++++----- fs/btrfs/extent_io.h | 6 +----- fs/btrfs/inode.c | 9 ++++----- 5 files changed, 18 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d21c30bf7053..fa0c72cabd8f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,6 +414,9 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e66fa18dbcf7..cf3dc7e501ec 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -196,7 +196,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) refcount_inc(&cb->pending_ios); ret = btrfs_repair_one_sector(inode, bbio, offset, bv.bv_page, bv.bv_offset, - btrfs_submit_data_read_bio); + true); if (ret) { refcount_dec(&cb->pending_ios); status = errno_to_blk_status(ret); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ec989b83f54..44ff41304247 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -797,7 +797,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook) + bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; @@ -856,11 +856,15 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, failrec->this_mirror); /* - * At this point we have a bio, so any errors from submit_bio_hook() - * will be handled by the endio on the repair_bio, so we can't return an + * At this point we have a bio, so any errors from bio submission will + * be handled by the endio on the repair_bio, so we can't return an * error here.
*/ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); + if (submit_buffered) + btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); + else + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror, 0); + return BLK_STS_OK; } @@ -951,7 +955,7 @@ static void submit_data_read_repair(struct inode *inode, ret = btrfs_repair_one_sector(inode, failed_bbio, bio_offset + offset, page, pgoff + offset, - btrfs_submit_data_read_bio); + true); if (!ret) { /* * We have submitted the read repair, the page release diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a5ec1475988f..321680f229c6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,10 +70,6 @@ struct extent_io_tree; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); -typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type); - typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); @@ -277,7 +273,7 @@ struct io_failure_record { int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook); + bool submit_buffered); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, unsigned int pg_offset); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1422f072a7b8..8cc1283fb347 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7923,9 +7923,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type) +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -7960,8 +7960,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, int ret; ret = btrfs_repair_one_sector(inode, bbio, offset, - bv.bv_page, bv.bv_offset, - submit_dio_repair_bio); + bv.bv_page, bv.bv_offset, false); if (ret) err = errno_to_blk_status(ret); } -- cgit From 7920b773bd8a458c05b2ba4581fe19c5704fffd7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:07:10 +0200 Subject: btrfs: drop parameter compression_type from btrfs_submit_dio_repair_bio Compression and direct io don't work together, so the compression parameter can be dropped after the previous patch changed the call to a direct one.
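Taken together with the previous patch, the repair submission path reduces to roughly this (a condensed sketch of the dispatch inside btrfs_repair_one_sector(), not the verbatim hunk):

        /* The submit_buffered flag replaces the old function pointer. */
        if (submit_buffered)
                btrfs_submit_data_read_bio(inode, repair_bio,
                                           failrec->this_mirror, 0);
        else
                btrfs_submit_dio_repair_bio(inode, repair_bio,
                                            failrec->this_mirror);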
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 4 +--- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index fa0c72cabd8f..62019d7c1cbd 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,9 +414,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type); +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 44ff41304247..cc05ae772fa5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -863,7 +863,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, if (submit_buffered) btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror, 0); + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); return BLK_STS_OK; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cc1283fb347..74ed0afe4c2f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7923,9 +7923,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - enum btrfs_compression_type compress_type) +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- cgit From ab2072b2921eced166dbdec1d65b0284b6eba2b9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:22:19 +0200 Subject: btrfs: change how submit bio callback is passed to btrfs_wq_submit_bio There's a callback function parameter for btrfs_wq_submit_bio that can be one of: metadata, buffered data, direct io data. The callback abstraction is unnecessary as we have all functions available. Replace the parameter with a command that leads to a direct call in run_one_async_start. The called functions can then be simplified, and we can also remove the extent_submit_bio_start_t typedef.
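With the callback gone, the dispatch in run_one_async_start() becomes a plain switch on the command (a sketch matching the diff below):

        switch (async->submit_cmd) {
        case WQ_SUBMIT_METADATA:
                ret = btree_submit_bio_start(async->inode, async->bio,
                                             async->dio_file_offset);
                break;
        case WQ_SUBMIT_DATA:
                ret = btrfs_submit_bio_start(async->inode, async->bio,
                                             async->dio_file_offset);
                break;
        case WQ_SUBMIT_DATA_DIO:
                ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio,
                                                       async->dio_file_offset);
                break;
        }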
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 5 +++++ fs/btrfs/disk-io.c | 33 +++++++++++++++++++++++---------- fs/btrfs/disk-io.h | 12 ++++++++++-- fs/btrfs/extent_io.h | 3 --- fs/btrfs/inode.c | 15 +++++++-------- 5 files changed, 45 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 62019d7c1cbd..72cf235b7beb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,6 +415,11 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset); +blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, + u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e5687bdff150..394dfe5ff567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -86,10 +86,10 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) struct async_submit_bio { struct inode *inode; struct bio *bio; - extent_submit_bio_start_t *submit_bio_start; + enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; - /* Optional parameter for submit_bio_start used by direct io */ + /* Optional parameter for used by direct io */ u64 dio_file_offset; struct btrfs_work work; blk_status_t status; @@ -637,8 +637,22 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + switch (async->submit_cmd) { + case WQ_SUBMIT_METADATA: + ret = btree_submit_bio_start(async->inode, async->bio, + async->dio_file_offset); + break; + case WQ_SUBMIT_DATA: + ret = btrfs_submit_bio_start(async->inode, async->bio, + async->dio_file_offset); + + break; + case WQ_SUBMIT_DATA_DIO: + ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, + async->dio_file_offset); + + break; + } if (ret) async->status = ret; } @@ -689,8 +703,7 @@ static void run_one_async_free(struct btrfs_work *work) * - false in case of error */ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; @@ -702,7 +715,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; - async->submit_bio_start = submit_bio_start; + async->submit_cmd = cmd; btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); @@ -736,8 +749,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { /* * when we're called for a 
write, we're already in the async @@ -776,7 +789,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * happen in parallel across all CPUs. */ if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 6edc66b4b4d3..5998b2589830 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -113,9 +113,17 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); + +enum btrfs_wq_submit_cmd { + WQ_SUBMIT_METADATA, + WQ_SUBMIT_DATA, + WQ_SUBMIT_DATA_DIO, +}; + bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 321680f229c6..b3d4b568fe33 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,9 +70,6 @@ struct extent_io_tree; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); -typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, - struct bio *bio, u64 dio_file_offset); - #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 74ed0afe4c2f..06931c20ebfa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,8 +2550,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } @@ -2758,8 +2758,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(bi->root)) { if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btrfs_submit_bio_start)) + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); @@ -7967,9 +7966,9 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, - struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, + u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } @@ -8017,7 +8016,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && btrfs_wq_submit_bio(inode, bio, 0, 
file_offset, - btrfs_submit_bio_start_direct_io)) + WQ_SUBMIT_DATA_DIO)) return; /* -- cgit From ad65ecf30b037549a17d2e402fac8d39242073d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:30:45 +0200 Subject: btrfs: simplify btree_submit_bio_start and btrfs_submit_bio_start parameters After previous patches the unused parameters can be removed from btree_submit_bio_start and btrfs_submit_bio_start as they don't need to conform to the extent_submit_bio_start_t typedef. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/disk-io.c | 11 +++-------- fs/btrfs/disk-io.h | 3 +-- fs/btrfs/inode.c | 3 +-- 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 72cf235b7beb..54bf002e0013 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,8 +415,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset); +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 394dfe5ff567..7bc0fee4da13 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -639,18 +639,14 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); switch (async->submit_cmd) { case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio, - async->dio_file_offset); - + ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); - break; } if (ret) @@ -749,8 +745,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct bio *bio) { /* * when we're called for a write, we're already in the async diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5998b2589830..d5b25fa8037b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -122,8 +122,7 @@ enum btrfs_wq_submit_cmd { bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset); +blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06931c20ebfa..cd9325f51ffd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,8 +2550,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ 
-blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } -- cgit From da67daab8dd679b9a10dee86520c530011342a8d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:40:36 +0200 Subject: btrfs: switch async_submit_bio::inode to btrfs_inode The async bio submit is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7bc0fee4da13..44806ccad43c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -84,7 +84,7 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + struct btrfs_inode *inode; struct bio *bio; enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; @@ -642,11 +642,11 @@ static void run_one_async_start(struct btrfs_work *work) ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); + ret = btrfs_submit_bio_start(&async->inode->vfs_inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, - async->dio_file_offset); + ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, + async->bio, async->dio_file_offset); break; } if (ret) @@ -665,7 +665,7 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - struct inode *inode = async->inode; + struct btrfs_inode *inode = async->inode; struct btrfs_bio *bbio = btrfs_bio(async->bio); /* If an error occurred we just want to clean up the bio and move on */ @@ -680,7 +680,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); + btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -708,7 +708,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, if (!async) return false; - async->inode = inode; + async->inode = BTRFS_I(inode); async->bio = bio; async->mirror_num = mirror_num; async->submit_cmd = cmd; -- cgit From 882681ac98837aa7683a59020c0c98d05479805f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_bio_start The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 54bf002e0013..4ec6a74dd6ba 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,7 +415,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio); +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44806ccad43c..3e42fab48e6b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -642,7 +642,7 @@ static void run_one_async_start(struct btrfs_work *work) ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(&async->inode->vfs_inode, async->bio); + ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd9325f51ffd..aa55cf6e763e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,9 +2550,9 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio) +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + return btrfs_csum_one_bio(inode, bio, (u64)-1, false); } /* -- cgit From bfa17066822cead4fb8a6a5d4a214a2c527e4614 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_bio_start_direct_io The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4ec6a74dd6ba..a41d4f953bfa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -416,7 +416,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e42fab48e6b..5aacb88069a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -645,7 +645,7 @@ static void run_one_async_start(struct btrfs_work *work) ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, + ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); break; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa55cf6e763e..06ec84be4b51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7965,11 +7965,11 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); + return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); } static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -- cgit From 5fcdadc2704534ef3a7b99aef65d54183c723493 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_wq_submit_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5aacb88069a7..65c168d6ed77 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -698,17 +698,17 @@ static void run_one_async_free(struct btrfs_work *work) * - true if the work has been succesfuly submitted * - false in case of error */ -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) return false; - async->inode = BTRFS_I(inode); + async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; async->submit_cmd = cmd; @@ -784,7 +784,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * happen in parallel across all CPUs. 
*/ if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) + btrfs_wq_submit_bio(BTRFS_I(inode), bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index d5b25fa8037b..65cf976b74e8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,7 +120,7 @@ enum btrfs_wq_submit_cmd { WQ_SUBMIT_DATA_DIO, }; -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ec84be4b51..a7cbbfc22329 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2757,7 +2757,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(bi->root)) { if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) + btrfs_wq_submit_bio(bi, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); @@ -8014,7 +8014,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_wq_submit_bio(BTRFS_I(inode), bio, 0, file_offset, WQ_SUBMIT_DATA_DIO)) return; -- cgit From 644094fd285499b484e50a8e986965b550a0924e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_metadata_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent_io.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65c168d6ed77..72e3446d9840 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -766,9 +766,9 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t ret; @@ -783,8 +783,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * Kthread helpers are used to submit writes so that checksumming can * happen in parallel across all CPUs. 
*/ - if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(BTRFS_I(inode), bio, mirror_num, 0, WQ_SUBMIT_METADATA)) + if (should_async_write(fs_info, inode) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 65cf976b74e8..f2c507fd0e04 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cc05ae772fa5..e770cbc5cb6a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -134,7 +134,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; if (!is_data_inode(inode)) - btrfs_submit_metadata_bio(inode, bio, mirror_num); + btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(inode, bio, mirror_num); else -- cgit From 535a7e5d6b7e04da0336c632c834a575e82b5082 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_data_write_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 17 ++++++++--------- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a41d4f953bfa..01fc62d39ed2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -411,7 +411,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e770cbc5cb6a..13fba51be32d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -136,7 +136,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) if (!is_data_inode(inode)) btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_submit_data_write_bio(inode, bio, mirror_num); + btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); else btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7cbbfc22329..538f55088bdf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2730,14 +2730,13 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) +void 
btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(bi, bio, + ret = extract_ordered_extent(inode, bio, page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); @@ -2753,14 +2752,14 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro * Csum items for reloc roots have already been cloned at this point, * so they are handled as part of the no-checksum case. */ - if (!(bi->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(bi->root)) { - if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(bi, bio, mirror_num, 0, WQ_SUBMIT_DATA)) + !btrfs_is_data_reloc_root(inode->root)) { + if (!atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; - ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); + ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; -- cgit From b762041629e73d8e24a10aae4bf256774fbbd082 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_data_read_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 5 +++-- fs/btrfs/inode.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 01fc62d39ed2..5406a2f817b2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -412,7 +412,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT_VALUE(size, bytes) size, bytes void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 13fba51be32d..ac69e5f1605c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -138,7 +138,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); else - btrfs_submit_data_read_bio(inode, bio, mirror_num, + btrfs_submit_data_read_bio(BTRFS_I(inode), bio, mirror_num, bio_ctrl->compress_type); /* The bio is owned by the end_io handler now */ @@ -861,7 +861,8 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * error here. 
*/ if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); + btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, + failrec->this_mirror, 0); else btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 538f55088bdf..e3bbd19edea3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2768,10 +2768,10 @@ void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int btrfs_submit_bio(fs_info, bio, mirror_num); } -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (compress_type != BTRFS_COMPRESS_NONE) { @@ -2779,7 +2779,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * btrfs_submit_compressed_read will handle completing the bio * if there were any errors, so just return here. */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); return; } @@ -2790,7 +2790,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; -- cgit From d781c1c315ce649002a9fd0387edc3ee93271743 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_dio_repair_bio The function is for internal interfaces so we should use the btrfs_inode. 
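A side effect worth noting: with a typed parameter, fs_info no longer has to be fetched through the superblock. A hedged one-function sketch (the function is hypothetical; both expressions appear verbatim in the surrounding diffs):

static struct btrfs_fs_info *sketch_fs_info(struct btrfs_inode *inode)
{
	/* Replaces the VFS route btrfs_sb(inode->vfs_inode.i_sb). */
	return inode->root->fs_info;
}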
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/inode.c | 5 ++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5406a2f817b2..9fe1a11a2eb3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,7 +414,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ac69e5f1605c..bddda397e438 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -864,7 +864,8 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); + btrfs_submit_dio_repair_bio(BTRFS_I(inode), repair_bio, + failrec->this_mirror); return BLK_STS_OK; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e3bbd19edea3..f0b4f3a9245f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7920,15 +7920,14 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); BUG_ON(bio_op(bio) == REQ_OP_WRITE); refcount_inc(&dip->refs); - btrfs_submit_bio(fs_info, bio, mirror_num); + btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, -- cgit From c5ca391b0dd82d4c0457159286bd19f2364a70be Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to submit_one_bio The function is for internal interfaces so we should use the btrfs_inode. 
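submit_one_bio receives no inode argument at all and recovers the owner from the bio. A simplified sketch of the lookup the diff below performs (function name hypothetical):

static struct btrfs_inode *sketch_bio_owner(struct bio *bio)
{
	struct bio_vec *bv = bio_first_bvec_all(bio);

	/*
	 * All pages queued on a buffered bio belong to one mapping,
	 * so the first bio_vec identifies the owning inode.
	 */
	return BTRFS_I(bv->bv_page->mapping->host);
}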
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bddda397e438..192f604ac7ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -117,7 +117,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct inode *inode; + struct btrfs_inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -125,7 +125,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = bv->bv_page->mapping->host; + inode = BTRFS_I(bv->bv_page->mapping->host); mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ @@ -133,12 +133,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - if (!is_data_inode(inode)) - btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); + if (!is_data_inode(&inode->vfs_inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); + btrfs_submit_data_write_bio(inode, bio, mirror_num); else - btrfs_submit_data_read_bio(BTRFS_I(inode), bio, mirror_num, + btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); /* The bio is owned by the end_io handler now */ -- cgit From d8f9268ece91e6fe887b444780787828890a8051 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_repair_one_sector The function is for internal interfaces so we should use the btrfs_inode. 
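While the series is mid-flight, converted functions still call not-yet-converted ones through the embedded VFS inode. A hedged sketch (legacy_helper is a stand-in, not a real btrfs function):

static void legacy_helper(struct inode *inode)
{
	/* Stands in for any helper still taking struct inode *. */
}

static void sketch_mixed_call(struct btrfs_inode *inode)
{
	/* Converted -> unconverted: unwrap the embedded inode ... */
	legacy_helper(&inode->vfs_inode);
	/* ... unconverted -> converted goes the other way via BTRFS_I(). */
}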
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 17 ++++++++--------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cf3dc7e501ec..6f5ad0d6c409 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -194,7 +194,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) int ret; refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, true); if (ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 192f604ac7ea..05768f7f7872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -795,13 +795,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; @@ -812,7 +812,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); + failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); @@ -830,7 +830,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_debug(fs_info, "failed to repair num_copies %d this_mirror %d failed_mirror %d", failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(inode, failrec); return -EIO; } @@ -851,7 +851,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, bio_add_page(repair_bio, page, failrec->len, pgoff); repair_bbio->iter = repair_bio->bi_iter; - btrfs_debug(btrfs_sb(inode->i_sb), + btrfs_debug(fs_info, "repair read error: submitting new read to mirror %d", failrec->this_mirror); @@ -861,11 +861,10 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * error here. 
*/ if (submit_buffered) - btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, + btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(BTRFS_I(inode), repair_bio, - failrec->this_mirror); + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); return BLK_STS_OK; } @@ -955,7 +954,7 @@ static void submit_data_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bbio, + ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, bio_offset + offset, page, pgoff + offset, true); if (!ret) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b3d4b568fe33..805e262811b4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -268,7 +268,7 @@ struct io_failure_record { int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, bool submit_buffered); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0b4f3a9245f..4a2fec637714 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7953,7 +7953,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, false); if (ret) err = errno_to_blk_status(ret); -- cgit From e2884c3d445669c231fd8ca4c6fd160e729ec862 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: switch btrfs_dio_private::inode to btrfs_inode The btrfs_dio_private structure is for internal interfaces so we should use the btrfs_inode. 
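The same idea applies one level down, to struct members. A condensed sketch of the before/after access patterns (struct name and field set trimmed; not the real definition):

struct sketch_dio_private {
	struct btrfs_inode *inode;	/* was: struct inode * */
	u64 file_offset;
	u64 bytes;
};

static void sketch_dio_use(struct sketch_dio_private *dip)
{
	/* btrfs state is now one dereference shorter ... */
	struct btrfs_root *root = dip->inode->root;
	/* ... and the VFS view stays reachable when needed. */
	struct inode *vfs_inode = &dip->inode->vfs_inode;

	(void)root;
	(void)vfs_inode;
}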
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a2fec637714..02100bb9c2a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,7 +84,7 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct inode *inode; + struct btrfs_inode *inode; /* * Since DIO can use anonymous page, we cannot use page_offset() to @@ -7907,11 +7907,11 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + btrfs_mark_ordered_io_finished(dip->inode, NULL, dip->file_offset, dip->bytes, !dip->bio.bi_status); } else { - unlock_extent(&BTRFS_I(dip->inode)->io_tree, + unlock_extent(&dip->inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); } @@ -7934,7 +7934,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, struct btrfs_bio *bbio, const bool uptodate) { - struct inode *inode = dip->inode; + struct inode *inode = &dip->inode->vfs_inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); blk_status_t err = BLK_STS_OK; @@ -7977,9 +7977,9 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) blk_status_t err = bio->bi_status; if (err) - btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + btrfs_warn(dip->inode->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -7989,7 +7989,7 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) if (err) dip->bio.bi_status = err; - btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); + btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8056,7 +8056,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip->inode = inode; + dip->inode = BTRFS_I(inode); dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; refcount_set(&dip->refs, 1); -- cgit From bb41632ea7d252855f7a12f269afefc18a84e07b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_submit_dio_bio The function is for internal interfaces so we should use the btrfs_inode. 
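The dio submit path mostly gains direct flag checks. A rough sketch of the shape the diff below gives it (heavily simplified; error handling and the async-submit branch omitted):

static void sketch_dio_submit(struct btrfs_inode *inode, struct bio *bio,
			      u64 file_offset)
{
	/* NODATASUM inodes skip checksumming entirely. */
	if (inode->flags & BTRFS_INODE_NODATASUM)
		goto map;

	/* Otherwise checksum synchronously before mapping the bio. */
	btrfs_csum_one_bio(inode, bio, file_offset, false);
map:
	btrfs_submit_bio(inode->root->fs_info, bio, 0);
}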
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 02100bb9c2a2..b1369f645f1b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7995,10 +7995,10 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) btrfs_dio_private_put(dip); } -static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, +static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, u64 file_offset, int async_submit) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_dio_private *dip = btrfs_bio(bio)->private; blk_status_t ret; @@ -8006,13 +8006,13 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (btrfs_op(bio) == BTRFS_MAP_READ) btrfs_bio(bio)->iter = bio->bi_iter; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) goto map; if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && - btrfs_wq_submit_bio(BTRFS_I(inode), bio, 0, file_offset, + if (async_submit && !atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, WQ_SUBMIT_DATA_DIO)) return; @@ -8020,7 +8020,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); + ret = btrfs_csum_one_bio(inode, bio, file_offset, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; @@ -8142,7 +8142,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); + btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; -- cgit From d9dcae67b7fe1f8c7d974a7a002c81d9b02b7281 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_truncate The function is for internal interfaces so we should use the btrfs_inode. 
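The trade-off visible in this patch: purely-VFS state (i_size, i_nlink, timestamps) still lives in the embedded inode and must be spelled out. A tiny sketch (hypothetical function name; the access pattern is as in the diff):

static loff_t sketch_isize(const struct btrfs_inode *inode)
{
	/* Generic VFS fields are reached through vfs_inode. */
	return inode->vfs_inode.i_size;
}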
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1369f645f1b..f612c8b9d4bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -124,7 +124,7 @@ static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); -static int btrfs_truncate(struct inode *inode, bool skip_writeback); +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -5270,7 +5270,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_dio_wait(inode); - ret = btrfs_truncate(inode, newsize == oldsize); + ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int err; @@ -8632,16 +8632,16 @@ out_noreserve: return ret; } -static int btrfs_truncate(struct inode *inode, bool skip_writeback) +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { - .inode = BTRFS_I(inode), - .ino = btrfs_ino(BTRFS_I(inode)), + .inode = inode, + .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; @@ -8649,7 +8649,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + ret = btrfs_wait_ordered_range(&inode->vfs_inode, + inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) return ret; @@ -8708,34 +8709,32 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->i_size; + const u64 new_size = inode->vfs_inode.i_size; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. 
*/ - btrfs_drop_extent_map_range(BTRFS_I(inode), + btrfs_drop_extent_map_range(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); - inode_sub_bytes(inode, control.sub_bytes); - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -8766,7 +8765,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -8774,14 +8773,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret2 = btrfs_update_inode(trans, root, inode); if (ret2 && !ret) ret = ret2; @@ -8807,7 +8806,7 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - btrfs_set_inode_full_sync(BTRFS_I(inode)); + btrfs_set_inode_full_sync(inode); return ret; } -- cgit From 29b6352b1494bd7ec6a14bda087f8eb858d2fc1f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inode_lock The function is for internal interfaces so we should use the btrfs_inode. 
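As a usage sketch of the converted locking API, here is a non-blocking shared acquisition in the style of the nowait paths (the function names and flags are real, the surrounding function is hypothetical, and the unlock signature is the one the following patches settle on):

static int sketch_try_shared_read(struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
	if (ret)	/* -EAGAIN: contended, caller may retry blocking */
		return ret;

	/* ... read-side work under inode_lock_shared() ... */

	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return 0;
}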
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/defrag.c | 4 ++-- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/file.c | 16 ++++++++-------- fs/btrfs/inode.c | 14 +++++++------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/reflink.c | 2 +- fs/btrfs/relocation.c | 2 +- 8 files changed, 22 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9fe1a11a2eb3..a06d1c0a0cc2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -545,7 +545,7 @@ enum btrfs_ilock_type { ENUM_BIT(BTRFS_ILOCK_MMAP), }; -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 919dfe0f7e50..6aade4838927 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1295,7 +1295,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; btrfs_inode_unlock(inode, 0); @@ -1351,7 +1351,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c024f97de9e0..4edf44d8cd9e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1647,7 +1647,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * item->readdir_list. 
*/ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f8be9d629e75..b3b7b276cce0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1193,7 +1193,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; - ret = btrfs_inode_lock(inode, ilock_flags); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; @@ -1468,7 +1468,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ilock_flags |= BTRFS_ILOCK_SHARED; relock: - err = btrfs_inode_lock(inode, ilock_flags); + err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (err < 0) return err; @@ -1616,7 +1616,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, loff_t count; ssize_t ret; - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); count = encoded->len; ret = generic_write_checks_count(iocb, &count); if (ret == 0 && count != encoded->len) { @@ -1806,7 +1806,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -2596,7 +2596,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -3054,7 +3054,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3691,7 +3691,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(BTRFS_I(inode), offset, whence); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; @@ -3748,7 +3748,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); again: /* * This is similar to what we do for direct IO writes, see the comment diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f612c8b9d4bc..9b0f84c85bef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -172,27 +172,27 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock_shared(inode)) + if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock_shared(inode); + 
inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock(inode)) + if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock(inode); + inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) - down_write(&BTRFS_I(inode)->i_mmap_lock); + down_write(&inode->i_mmap_lock); return 0; } @@ -10529,7 +10529,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, file_accessed(iocb->ki_filp); - btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6f9f8e8dfea6..a912e1cb283d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2528,7 +2528,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); if (!err) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9d728107536e..c62c7fdd55d9 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -893,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(src_inode, dst_inode); btrfs_double_mmap_lock(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2ecca24e1001..f34cb92f3151 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2874,7 +2874,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - btrfs_inode_lock(&inode->vfs_inode, 0); + btrfs_inode_lock(inode, 0); for (nr = 0; nr < cluster->nr; nr++) { struct extent_state *cached_state = NULL; -- cgit From e5d4d75bd3241d0a5990060ef9601bf17adf9bb5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inode_unlock The function is for internal interfaces so we should use the btrfs_inode. 
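BTRFS_ILOCK_MMAP pairs the VFS inode lock with the btrfs i_mmap_lock, and lock/unlock must be called with matching flags. A rough sketch of the pairing (hypothetical function; ordering as in the implementations quoted here):

static void sketch_mmap_critical_section(struct btrfs_inode *inode)
{
	/* Takes inode_lock(), then down_write(&inode->i_mmap_lock). */
	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	/* ... punch-hole / fallocate style extent manipulation ... */

	/* Releases i_mmap_lock first, then inode_unlock(). */
	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
}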
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/defrag.c | 8 ++++---- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/file.c | 32 ++++++++++++++++---------------- fs/btrfs/inode.c | 18 +++++++++--------- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/reflink.c | 2 +- fs/btrfs/relocation.c | 2 +- 8 files changed, 36 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a06d1c0a0cc2..279bbbc59dc6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -546,7 +546,7 @@ enum btrfs_ilock_type { }; int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 6aade4838927..0a3c261b69c9 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1298,11 +1298,11 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, btrfs_inode_lock(BTRFS_I(inode), 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); break; } if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); break; } if (do_compress) @@ -1315,7 +1315,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (sectors_defragged > prev_sectors_defragged) balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1353,7 +1353,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (do_compress) { btrfs_inode_lock(BTRFS_I(inode), 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); } return ret; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 4edf44d8cd9e..0095c6e4c3d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1646,7 +1646,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b3b7b276cce0..e6c93be91a06 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1428,7 +1428,7 @@ again: iocb->ki_pos += num_written; } out: - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return num_written ? 
num_written : ret; } @@ -1474,13 +1474,13 @@ relock: err = generic_write_checks(iocb, from); if (err <= 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return err; } err = btrfs_write_check(iocb, from, err); if (err < 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; } @@ -1491,13 +1491,13 @@ relock: */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ilock_flags &= ~BTRFS_ILOCK_SHARED; goto relock; } if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } @@ -1528,7 +1528,7 @@ relock: * iocb, and that needs to lock the inode. So unlock it before calling * iomap_dio_complete() to avoid a deadlock. */ - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); if (IS_ERR_OR_NULL(dio)) err = PTR_ERR_OR_ZERO(dio); @@ -1635,7 +1635,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_do_encoded_write(iocb, from, encoded); out: - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); return ret; } @@ -1830,7 +1830,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -1933,7 +1933,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. 
*/ - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2001,7 +2001,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -2644,7 +2644,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } } @@ -2743,7 +2743,7 @@ out_only_mutex: ret = ret2; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3104,7 +3104,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3202,7 +3202,7 @@ out_unlock: unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } @@ -3693,7 +3693,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_HOLE: btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(BTRFS_I(inode), offset, whence); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } @@ -3797,7 +3797,7 @@ again: goto again; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); return ret < 0 ? ret : read; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b0f84c85bef..60cc23997216 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -202,14 +202,14 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. 
*/ -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) - inode_unlock_shared(inode); + inode_unlock_shared(&inode->vfs_inode); else - inode_unlock(inode); + inode_unlock(&inode->vfs_inode); } /* @@ -10277,7 +10277,7 @@ static ssize_t btrfs_encoded_read_inline( read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; ret = copy_to_iter(tmp, count, iter); @@ -10480,7 +10480,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; if (compressed) { @@ -10532,7 +10532,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0; } start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); @@ -10630,7 +10630,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (disk_bytenr == EXTENT_MAP_HOLE) { unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) @@ -10653,7 +10653,7 @@ out_unlock_extent: unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a912e1cb283d..1832d3011d12 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1002,7 +1002,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); return error; } @@ -2530,14 +2530,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, btrfs_inode_lock(BTRFS_I(inode), 0); err = btrfs_delete_subvolume(dir, dentry); - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); out_dput: dput(dentry); out_unlock_dir: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c62c7fdd55d9..0474bbe39da7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -911,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, out_unlock: if (same_inode) { - btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f34cb92f3151..d75b18e84285 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2894,7 +2894,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) break; } - 
btrfs_inode_unlock(&inode->vfs_inode, 0); + btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, -- cgit From 7152b425da54b6aa6ddfb0cbc0603614cc2400bd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_dirty_inode The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 60cc23997216..dc06511c359a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -286,7 +286,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } -static int btrfs_dirty_inode(struct inode *inode); +static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) @@ -5313,7 +5313,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (attr->ia_valid) { setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(inode); + err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); @@ -6144,21 +6144,21 @@ err: * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -static int btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -6166,10 +6166,10 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans); - if (BTRFS_I(inode)->delayed_node) + if (inode->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; @@ -6196,7 +6196,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; - return dirty ? btrfs_dirty_inode(inode) : 0; + return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* -- cgit From 82ca5a04f03fa1e74a5034ec61ac86d9c817a2b9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_add_delalloc_inodes The function is for internal interfaces so we should use the btrfs_inode. 
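Condensed from the diff below, the payoff is the removal of repeated BTRFS_I() wrapping at each field access (hypothetical function names; the field accesses are as in the patch):

/* Before: every access re-derives the btrfs inode. */
static void sketch_before(struct inode *inode)
{
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes))
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
}

/* After: the parameter type already says what it is. */
static void sketch_after(struct btrfs_inode *inode)
{
	if (list_empty(&inode->delalloc_inodes))
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&inode->runtime_flags);
}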
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc06511c359a..d9d7e11d8d9a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2369,16 +2369,14 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); + if (list_empty(&inode->delalloc_inodes)) { + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); @@ -2391,7 +2389,6 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } - void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { @@ -2458,7 +2455,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); + btrfs_add_delalloc_inodes(root, BTRFS_I(inode)); spin_unlock(&BTRFS_I(inode)->lock); } -- cgit From 36eeaef5595dc09f1dc4c859d96ffb3cd69dee55 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:22:15 +0200 Subject: btrfs: switch btrfs_writepage_fixup::inode to btrfs_inode The btrfs_writepage_fixup structure is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d9d7e11d8d9a..06ea43460e90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2890,7 +2890,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2909,7 +2909,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; - inode = BTRFS_I(fixup->inode); + inode = fixup->inode; page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; @@ -3068,7 +3068,7 @@ int btrfs_writepage_cow_fixup(struct page *page) get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - fixup->inode = inode; + fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; -- cgit From 621af94af3342c9a8c3df34b4231d7707accd00e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_check_data_csum The function is for internal interfaces so we should use the btrfs_inode. 
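A usage sketch for the converted verifier, modeled on the per-sector loops in the callers below (simplified: assumes one full page sitting at bio offset 0; the sketch_* name is hypothetical):

static int sketch_verify_page(struct btrfs_inode *inode,
			      struct btrfs_bio *bbio, struct page *page)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u32 pgoff;

	for (pgoff = 0; pgoff < PAGE_SIZE; pgoff += sectorsize) {
		/*
		 * Non-zero return means a csum mismatch; the callee
		 * reports the error and zeroes the bad range.
		 */
		if (btrfs_check_data_csum(inode, bbio, pgoff, page, pgoff))
			return -EIO;
	}
	return 0;
}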
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 4 +--- fs/btrfs/compression.c | 2 +- fs/btrfs/inode.c | 15 +++++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 279bbbc59dc6..ba0dbdc91ec5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -421,13 +421,11 @@ blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6f5ad0d6c409..42e6dde2ad59 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -186,7 +186,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) u64 start = bbio->file_offset + offset; if (!status && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, + (!csum || !btrfs_check_data_csum(bi, bbio, offset, bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(bi, start, bv.bv_page, bv.bv_offset); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ea43460e90..76e2bea2270a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3490,10 +3490,10 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of * When csum mismatch is detected, we will also report the error and fill the * corrupted range with zero. 
(Thus it needs the extra parameters) */ -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u32 len = fs_info->sectorsize; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; @@ -3507,8 +3507,7 @@ int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), - bbio->file_offset + bio_offset, + btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, @@ -3573,7 +3572,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); + ret = btrfs_check_data_csum(BTRFS_I(inode), bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -7943,8 +7942,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, u64 start = bbio->file_offset + offset; if (uptodate && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, - bv.bv_offset))) { + (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, + bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(BTRFS_I(inode), start, bv.bv_page, bv.bv_offset); } else { @@ -10334,7 +10333,7 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + if (btrfs_check_data_csum(inode, bbio, bio_offset, bvec->bv_page, pgoff)) return BLK_STS_IOERR; bio_offset += sectorsize; -- cgit From e569b1d545511492d5fc1a062ef8b63fd1b78d84 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to __unlink_start_trans The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76e2bea2270a..ffb42560acd5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4415,9 +4415,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. 
*/ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) +static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; /* * 1 for the possible orphan item @@ -4443,7 +4443,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(dir); + trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fscrypt_free; @@ -4857,7 +4857,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(dir); + trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_notrans; -- cgit From 3c4f91e23a87a486426db5e481ed315b1b2640f1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_delete_subvolume The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/inode.c | 10 +++++----- fs/btrfs/ioctl.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ba0dbdc91ec5..481c75c47fc4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -439,7 +439,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ffb42560acd5..5e2fd24b6951 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4698,10 +4698,10 @@ again: spin_unlock(&root->inode_lock); } -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4758,9 +4758,9 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + btrfs_record_snapshot_destroy(trans, dir); - ret = btrfs_unlink_subvol(trans, dir, dentry); + ret = btrfs_unlink_subvol(trans, &dir->vfs_inode, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4848,7 +4848,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(dir, dentry); + return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1832d3011d12..7a9e697d9931 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2529,7 +2529,7 @@ static noinline int 
btrfs_ioctl_snap_destroy(struct file *file, } btrfs_inode_lock(BTRFS_I(inode), 0); - err = btrfs_delete_subvolume(dir, dentry); + err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); -- cgit From 35da5a7edec32935135737608c9d9da24a419bab Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:47:06 +0200 Subject: btrfs: drop private_data parameter from extent_io_tree_init All callers except one pass NULL, so the parameter can be dropped and the inode::io_tree initialization can be open coded. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/extent-io-tree.c | 5 ++--- fs/btrfs/extent-io-tree.h | 3 +-- fs/btrfs/inode.c | 5 +++-- fs/btrfs/relocation.c | 3 +-- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/btrfs/tests/extent-io-tests.c | 4 ++-- fs/btrfs/transaction.c | 4 ++-- fs/btrfs/volumes.c | 3 +-- 9 files changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 72e3446d9840..e602e0b4c25d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,9 +1044,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); + IO_TREE_ROOT_DIRTY_LOG_PAGES); extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE, NULL); + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -2250,7 +2250,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO, NULL); + IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); @@ -3074,7 +3074,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS, NULL); + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 2cdff74ff033..81365870576f 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -94,13 +94,12 @@ struct tree_entry { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) + struct extent_io_tree *tree, unsigned int owner) { tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->private_data = private_data; + tree->private_data = NULL; tree->owner = owner; if (owner == IO_TREE_INODE_FILE_EXTENT) lockdep_set_class(&tree->lock, &file_extent_tree_class); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index d73ef24bad2e..0e16642c28a3 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -104,8 +104,7 @@ struct extent_state { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data); + struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e2fd24b6951..29c6c0ec0016 100644 --- a/fs/btrfs/inode.c 
+++ b/fs/btrfs/inode.c @@ -8871,9 +8871,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + ei->io_tree.private_data = inode; extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT, NULL); + IO_TREE_INODE_FILE_EXTENT); ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d75b18e84285..1440cb332a5a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3928,8 +3928,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->dirty_subvol_roots); btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(fs_info, &rc->processed_blocks, - IO_TREE_RELOC_BLOCKS, NULL); + extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 669fa7133a2f..181469fc0bb3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(NULL, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 350da449db08..dfc5c7fa6038 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -132,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -489,7 +489,7 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2e2dd2ea109b..b8c52e89688c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -378,9 +378,9 @@ loop: spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES, NULL); + IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS, NULL); + IO_TREE_FS_PINNED_EXTENTS); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7751ab620761..acb97494bb52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7048,8 +7048,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, - IO_TREE_DEVICE_ALLOC_STATE, NULL); + 
extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; -- cgit From 0988fc7bda79feff046056f032fd149ab08e03d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:55:51 +0200 Subject: btrfs: switch extent_io_tree::private_data to btrfs_inode and rename The extent_io_tree::private_data was meant to be a preparatory work for the metadata inode rework but that never materialized. Now it's used only for an inode so it's better to change the appropriate type and rename it. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 32 ++++++++++++++++---------------- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/inode.c | 2 +- 3 files changed, 19 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 81365870576f..bbcc65593d1d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -58,17 +58,17 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_inode *inode = tree->inode; u64 isize; if (!inode) return; - isize = i_size_read(inode); + isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + btrfs_debug_rl(inode->root->fs_info, "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + caller, btrfs_ino(inode), isize, start, end); } } #else @@ -99,7 +99,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->private_data = NULL; + tree->inode = NULL; tree->owner = owner; if (owner == IO_TREE_INODE_FILE_EXTENT) lockdep_set_class(&tree->lock, &file_extent_tree_class); @@ -346,9 +346,9 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = prev_state(state); if (other && other->end == state->start - 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); + if (tree->inode) + btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, + other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -357,8 +357,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = next_state(state); if (other && other->start == state->end + 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, state, + if (tree->inode) + btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); @@ -374,8 +374,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_set_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_set_delalloc_extent(&tree->inode->vfs_inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -462,8 +462,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->private_data) - btrfs_split_delalloc_extent(tree->private_data, orig, split); + if (tree->inode) + 
btrfs_split_delalloc_extent(&tree->inode->vfs_inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -510,8 +510,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_clear_delalloc_extent(&tree->inode->vfs_inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 0e16642c28a3..cdee8c0854c8 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -80,7 +80,8 @@ enum { struct extent_io_tree { struct rb_root state; struct btrfs_fs_info *fs_info; - void *private_data; + /* Inode associated with this tree, or NULL. */ + struct btrfs_inode *inode; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29c6c0ec0016..23ec33ac86e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8872,7 +8872,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); - ei->io_tree.private_data = inode; + ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); ei->io_failure_tree = RB_ROOT; -- cgit From 2454151cdede9f6f3a2213ae84b07d9a9edb485e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_merge_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 6 ++---- fs/btrfs/inode.c | 16 ++++++++-------- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 481c75c47fc4..130c95c6f7df 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -475,7 +475,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index bbcc65593d1d..942212e1dbaf 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -347,8 +347,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) if (other && other->end == state->start - 1 && other->state == state->state) { if (tree->inode) - btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, - other); + btrfs_merge_delalloc_extent(tree->inode, state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -358,8 +357,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) if (other && other->start == state->end + 1 && other->state == state->state) { if (tree->inode) - btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, - other); + btrfs_merge_delalloc_extent(tree->inode, state, 
other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 23ec33ac86e0..a2de63d38b1a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2314,10 +2314,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; @@ -2332,9 +2332,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); return; } @@ -2363,9 +2363,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, if (count_max_extents(fs_info, new_size) >= num_extents) return; - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, -- cgit From 4c5d166f6b3674473905ff133daea4de33f00b91 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_set_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. 
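One recurring simplification in these conversions shows up in the first hunk of the diff below: with only a VFS inode, the filesystem-wide state must be reached via the superblock, while a btrfs_inode reaches it directly through its cached root. Both forms resolve to the same struct btrfs_fs_info; the fragment is illustrative only:

    /* From a struct inode: detour through the VFS superblock. */
    struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

    /* From a struct btrfs_inode: the root already caches fs_info. */
    struct btrfs_fs_info *fs_info = inode->root->fs_info;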
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 33 ++++++++++++++++----------------- 3 files changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 130c95c6f7df..8a8280233199 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -471,7 +471,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 942212e1dbaf..1b4c52d7201d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -373,7 +373,7 @@ static void set_state_bits(struct extent_io_tree *tree, int ret; if (tree->inode) - btrfs_set_delalloc_extent(&tree->inode->vfs_inode, state, bits); + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2de63d38b1a..fae41b954373 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2421,10 +2421,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ -void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, +void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); @@ -2434,14 +2434,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); + bool do_list = !btrfs_is_free_space_inode(inode); - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, num_extents); + spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -2449,22 +2449,21 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->delalloc_bytes += len; + spin_lock(&inode->lock); + inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) - BTRFS_I(inode)->defrag_bytes += len; + inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) - btrfs_add_delalloc_inodes(root, BTRFS_I(inode)); - spin_unlock(&BTRFS_I(inode)->lock); + 
&inode->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); + spin_unlock(&inode->lock); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - - state->start; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->new_delalloc_bytes += state->end + 1 - state->start; + spin_unlock(&inode->lock); } } -- cgit From 62798a491561e1e1bbbd08873561532f19525fbf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_split_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8a8280233199..ddf1867ba6d5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -477,7 +477,7 @@ void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 1b4c52d7201d..124aede5e492 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -461,7 +461,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node **node; if (tree->inode) - btrfs_split_delalloc_extent(&tree->inode->vfs_inode, orig, split); + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fae41b954373..a43d7a063807 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2277,10 +2277,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return ret; } -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; /* not delalloc, ignore it */ @@ -2304,9 +2304,9 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; } - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); } /* -- cgit From bd54766e40df1300564babbbc31866d36c0c4bb6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_clear_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. 
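The delalloc callbacks being converted in this run (set/clear/merge/split) all keep the inode's outstanding extent count in sync with how many maximum-sized extents a dirty range could become; count_max_extents(), visible in the diff below, is a ceiling division by fs_info->max_extent_size. A rough user-space model of that arithmetic, assuming the usual 128M maximum extent size:

    #include <stdint.h>
    #include <stdio.h>

    /* ceil(size / max): extents a contiguous delalloc range may need */
    static uint32_t count_max_extents(uint64_t size, uint64_t max)
    {
            return (size + max - 1) / max;
    }

    int main(void)
    {
            const uint64_t max = 128ULL << 20;      /* assumed 128M */

            /* One 128M range needs 1 reservation; split into two halves
             * it needs 1 + 1, hence the +1 in the split callback. */
            printf("%u\n", count_max_extents(128ULL << 20, max));  /* 1 */
            printf("%u\n", count_max_extents(64ULL << 20, max));   /* 1 */

            /* Merging two 96M ranges yields 192M, which still needs 2
             * extents, so the merge callback releases nothing there. */
            printf("%u\n", count_max_extents(192ULL << 20, max));  /* 2 */
            return 0;
    }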
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ddf1867ba6d5..9e31dc8b0285 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -473,7 +473,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 124aede5e492..285b0ff6e953 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -509,7 +509,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, int ret; if (tree->inode) - btrfs_clear_delalloc_extent(&tree->inode->vfs_inode, state, bits); + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a43d7a063807..59ec9b2ef508 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2471,11 +2471,10 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ -void btrfs_clear_delalloc_extent(struct inode *vfs_inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_inode *inode = BTRFS_I(vfs_inode); - struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); -- cgit From 5b7544cb06fef21e4f8c189082a7bf02fd9832a1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_unlink_subvol The function is for internal interfaces so we should use the btrfs_inode. 
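A detail worth knowing before reading the diff below: a btrfs directory's i_size is not a byte count of its contents but the total length of the names stored in it, and each name is stored twice (once in the hash-keyed DIR_ITEM, once in the sequentially keyed DIR_INDEX). That is where the factor of two comes from when an entry is removed:

    /* Removing a directory entry shrinks i_size by twice the name length
     * (taken from the hunk below; dir is a struct btrfs_inode *). */
    btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);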
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59ec9b2ef508..3577ee142888 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4470,9 +4470,9 @@ fscrypt_free: } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry) + struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; @@ -4481,10 +4481,10 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, u64 index; int ret; u64 objectid; - u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + u64 dir_ino = btrfs_ino(dir); struct fscrypt_name fname; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) return ret; @@ -4557,17 +4557,17 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } } - ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - fname.disk_name.len * 2); - inode_inc_iversion(dir); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; - ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); + inode_inc_iversion(&dir->vfs_inode); + dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); + dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -4758,7 +4758,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, dir); - ret = btrfs_unlink_subvol(trans, &dir->vfs_inode, dentry); + ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4862,7 +4862,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, dentry); + err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } @@ -9208,7 +9208,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), @@ -9223,7 +9223,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), @@ -9470,7 +9470,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = 
btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), @@ -9488,7 +9488,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), -- cgit From d1de429bcedac78d047c545d748c7b45fe16d56d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inode_by_name The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3577ee142888..75a0e2350cdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5565,12 +5565,12 @@ no_delete: * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ -static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, +static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; @@ -5578,13 +5578,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, if (!path) return -ENOMEM; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) goto out; /* This needs to handle no-key deletions later on */ - di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; @@ -5597,7 +5597,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, fname.disk_name.name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(dir), location->objectid, location->type, location->offset); } if (!ret) @@ -5881,7 +5881,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); + ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); -- cgit From 3c1b1c4c0e874dc9f2cf2faa231d39c4ca0d5888 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to fixup_tree_root_location The function is for internal interfaces so we should use the btrfs_inode. 
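For context on fixup_tree_root_location(): when a directory entry points at a subvolume, the lookup has to hop from the parent root into the child root, much like crossing a mount point. The check sketched here mirrors the diff that follows; the parent root must hold a ROOT_REF item recording the child subvolume, keyed against this directory and name:

    struct btrfs_key key;

    key.objectid = dir->root->root_key.objectid;  /* parent subvolume id */
    key.type = BTRFS_ROOT_REF_KEY;
    key.offset = location->objectid;              /* child subvolume id */
    /* ... search the root tree, then validate the back-reference: */
    if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
        btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
            goto out;   /* stale or mismatched reference */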
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 75a0e2350cdb..a14a2b3a928b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5614,7 +5614,7 @@ out: * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, - struct inode *dir, + struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) @@ -5628,7 +5628,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, int err = 0; struct fscrypt_name fname; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); if (ret) return ret; @@ -5639,7 +5639,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.objectid = dir->root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5652,7 +5652,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; @@ -5902,7 +5902,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - ret = fixup_tree_root_location(fs_info, dir, dentry, + ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) -- cgit From 4c45a4f4de1b7083800954afa4821c67df157967 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to inode_tree_add The function is for internal interfaces so we should use the btrfs_inode. 
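inode_tree_add(), converted next, maintains a per-subvolume cache of in-memory inodes: a red-black tree hanging off the root, keyed by inode number and protected by root->inode_lock. A simplified insertion walk (the real code also handles an already-cached inode and entries being evicted concurrently):

    struct rb_node **p = &root->inode_tree.rb_node;
    struct rb_node *parent = NULL;

    spin_lock(&root->inode_lock);
    while (*p) {
            struct btrfs_inode *entry;

            parent = *p;
            entry = rb_entry(parent, struct btrfs_inode, rb_node);
            if (ino < btrfs_ino(entry))
                    p = &parent->rb_left;
            else
                    p = &parent->rb_right;
    }
    rb_link_node(&inode->rb_node, parent, p);
    rb_insert_color(&inode->rb_node, &root->inode_tree);
    spin_unlock(&root->inode_lock);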
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a14a2b3a928b..b1a4f53cc2c4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5680,16 +5680,16 @@ out: return err; } -static void inode_tree_add(struct inode *inode) +static void inode_tree_add(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; - struct rb_node *new = &BTRFS_I(inode)->rb_node; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct rb_node *new = &inode->rb_node; + u64 ino = btrfs_ino(inode); - if (inode_unhashed(inode)) + if (inode_unhashed(&inode->vfs_inode)) return; parent = NULL; spin_lock(&root->inode_lock); @@ -5801,7 +5801,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, ret = btrfs_read_locked_inode(inode, path); if (!ret) { - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); unlock_new_inode(inode); } else { iget_failed(inode); @@ -6568,7 +6568,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); -- cgit From 7a0443f031a68188a38e0845686747d4ccd2c16d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: btrfs: pass btrfs_inode to btrfs_inherit_iflags The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1a4f53cc2c4..a242417a7e18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6345,27 +6345,27 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) * * Currently only the compression flags and the cow flags are inherited. */ -static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) { unsigned int flags; - flags = BTRFS_I(dir)->flags; + flags = dir->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + inode->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->vfs_inode.i_mode)) + inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(inode); + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6424,7 +6424,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * change it now without compatibility issues. 
*/ if (!args->subvol) - btrfs_inherit_iflags(inode, dir); + btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) -- cgit From 99a81a444448ef6cde906500dbdf26aa5961e291 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:37:50 +0200 Subject: btrfs: switch async_chunk::inode to btrfs_inode The async_chunk::inode structure is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a242417a7e18..d2409d8a147a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -520,7 +520,7 @@ struct async_extent { }; struct async_chunk { - struct inode *inode; + struct btrfs_inode *inode; struct page *locked_page; u64 start; u64 end; @@ -648,7 +648,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = async_chunk->inode; + struct inode *inode = &async_chunk->inode->vfs_inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; @@ -1113,7 +1113,7 @@ out_free: */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; @@ -1494,7 +1494,7 @@ static noinline void async_cow_start(struct btrfs_work *work) compressed_extents = compress_file_range(async_chunk); if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); async_chunk->inode = NULL; } } @@ -1534,7 +1534,7 @@ static noinline void async_cow_free(struct btrfs_work *work) async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1602,7 +1602,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; - async_chunk[i].inode = &inode->vfs_inode; + async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; -- cgit From 99a01bd6388e52dabb56015b939c1e9b26b0196e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:47:16 +0200 Subject: btrfs: use btrfs_inode inside compress_file_range The function is mostly using internal interfaces so we should use the btrfs_inode. 
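Two details in compress_file_range() worth spelling out before the diff: the compression algorithm is chosen with a clear precedence (an explicit defrag request wins over the per-inode property, which wins over the mount-wide default), and the type and level travel packed into one integer. A sketch of both, with the default taken from fs_info as an assumption:

    int compress_type = fs_info->compress_type;     /* mount default, assumed */

    if (inode->defrag_compress)
            compress_type = inode->defrag_compress;
    else if (inode->prop_compress)
            compress_type = inode->prop_compress;

    /* Low 4 bits select the algorithm, the level is shifted above them,
     * matching the btrfs_compress_pages() call in the diff below: */
    int type_level = compress_type | (fs_info->compress_level << 4);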
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2409d8a147a..28725dd76db9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -648,8 +648,8 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = &async_chunk->inode->vfs_inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = async_chunk->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -666,8 +666,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) int compressed_extents = 0; int redirty = 0; - inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, - SZ_16K); + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to save i_size before now because it could change in between @@ -679,7 +678,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) * does that for us. */ barrier(); - i_size = i_size_read(inode); + i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: @@ -708,7 +707,7 @@ again: * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; /* @@ -732,7 +731,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (inode_need_compress(inode, start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -741,10 +740,10 @@ again: goto cont; } - if (BTRFS_I(inode)->defrag_compress) - compress_type = BTRFS_I(inode)->defrag_compress; - else if (BTRFS_I(inode)->prop_compress) - compress_type = BTRFS_I(inode)->prop_compress; + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; /* * we need to call clear_page_dirty_for_io on each @@ -759,14 +758,14 @@ again: * has moved, the end is the original one. */ if (!redirty) { - extent_range_clear_dirty_for_io(inode, start, end); + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); redirty = 1; } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->i_mapping, start, + inode->vfs_inode.i_mapping, start, pages, &nr_pages, &total_in, @@ -795,12 +794,12 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, 0, BTRFS_COMPRESS_NONE, NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, false); @@ -823,7 +822,7 @@ cont: * our outstanding extent for clearing delalloc for this * range. 
*/ - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, clear_flags, PAGE_UNLOCK | @@ -898,8 +897,8 @@ cont: /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(BTRFS_I(inode)->prop_compress)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + !(inode->prop_compress)) { + inode->flags |= BTRFS_INODE_NOCOMPRESS; } } cleanup_and_bail_uncompressed: @@ -917,7 +916,7 @@ cleanup_and_bail_uncompressed: } if (redirty) - extent_range_redirty_for_io(inode, start, end); + extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); compressed_extents++; -- cgit From 5fc24314c89491cb2f53583d5e513731d58c7699 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:47:16 +0200 Subject: btrfs: use btrfs_inode inside btrfs_verify_data_csum The function is mostly using internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 28725dd76db9..d3e5a5ff2d5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3529,10 +3529,10 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; @@ -3545,7 +3545,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, if (bbio->csum == NULL) return 0; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) return 0; if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) @@ -3569,7 +3569,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = btrfs_check_data_csum(BTRFS_I(inode), bbio, bio_offset, page, pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; -- cgit From e55cf7ca85e323028774feeb117ad94358a78070 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:53:04 +0200 Subject: btrfs: pass btrfs_inode to btrfs_add_delayed_iput The function is for internal interfaces so we should use the btrfs_inode. 
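Why delayed iputs exist at all: dropping the very last reference triggers eviction, which can flush delalloc data and wait on transactions; in contexts that already hold such resources (for example the writepage fixup worker mentioned in the diff below) that would deadlock. So the last reference is parked on a list for the cleaner kthread to drop later. The core of the helper, mirroring the rewritten function in the diff:

    if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
            return;         /* not the last reference: plain decrement */

    atomic_inc(&fs_info->nr_delayed_iputs);
    spin_lock(&fs_info->delayed_iput_lock);
    ASSERT(list_empty(&inode->delayed_iput));
    list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
    spin_unlock(&fs_info->delayed_iput_lock);
    if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
            wake_up_process(fs_info->cleaner_kthread);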
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/free-space-cache.c | 4 ++-- fs/btrfs/inode.c | 19 +++++++++---------- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/tree-log.c | 24 ++++++++++++------------ 7 files changed, 28 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9e31dc8b0285..195c09e20609 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -501,7 +501,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_add_delayed_iput(struct btrfs_inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05768f7f7872..859a41624c31 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3228,7 +3228,7 @@ retry: if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) mapping->writeback_index = done_index; - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 627bd6120368..0d250d052487 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -260,7 +260,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); goto out; } clear_nlink(inode); @@ -274,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for the lookup ref */ - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3e5a5ff2d5c..aec1b232a71c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1493,7 +1493,7 @@ static noinline void async_cow_start(struct btrfs_work *work) compressed_extents = compress_file_range(async_chunk); if (compressed_extents == 0) { - btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); + btrfs_add_delayed_iput(async_chunk->inode); async_chunk->inode = NULL; } } @@ -1533,7 +1533,7 @@ static noinline void async_cow_free(struct btrfs_work *work) async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) - btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); + btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -3016,7 +3016,7 @@ out_page: * that could need flushing space. Recursing back to fixup worker would * deadlock. */ - btrfs_add_delayed_iput(&inode->vfs_inode); + btrfs_add_delayed_iput(inode); } /* @@ -3590,18 +3590,17 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. 
*/ -void btrfs_add_delayed_iput(struct inode *inode) +void btrfs_add_delayed_iput(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (atomic_add_unless(&inode->i_count, -1, 1)) + if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); - ASSERT(list_empty(&binode->delayed_iput)); - list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + ASSERT(list_empty(&inode->delayed_iput)); + list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); @@ -9661,7 +9660,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret || wbc->nr_to_write <= 0) goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8fda1949b71b..4bed0839b640 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -504,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(entry->inode); + btrfs_add_delayed_iput(BTRFS_I(entry->inode)); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1440cb332a5a..8914aa920bb7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1117,7 +1117,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); first = 0; } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { @@ -1181,7 +1181,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (dirty) btrfs_mark_buffer_dirty(leaf); if (inode) - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b6e99ef99679..f7b1bb9c63e4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5473,7 +5473,7 @@ again: } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); break; } @@ -5482,7 +5482,7 @@ again: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5676,11 +5676,11 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * so that the log ends up with the new name and without the old name. 
 */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return 0; } - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5755,7 +5755,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; continue; @@ -5772,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * that, we can avoid doing it again. */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); continue; } @@ -5784,7 +5784,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; } @@ -6294,7 +6294,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); continue; } @@ -6307,7 +6307,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) break; @@ -6768,7 +6768,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); continue; } @@ -6778,7 +6778,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); if (ret) goto out; } @@ -6823,7 +6823,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) return ret; -- cgit
From 2942a50dea74126bf3395b3060a808fb046136fc Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 7 Nov 2022 15:32:29 +0800
Subject: btrfs: raid56: introduce btrfs_raid_bio::error_bitmap

Currently btrfs raid56 uses btrfs_raid_bio::faila and failb to indicate which stripe(s) had IO errors. But that has some problems:

- If one sector fails its csum check, the whole stripe containing the corruption will be marked as having an error. This can reduce the chance of a successful recovery, like this:

           0    4K   8K
    Data 1 |XX |    |
    Data 2 |   | XX |
    Parity |   |    |

  In the above case, 0~4K in data 1 should be recovered using data 2 and parity, while 4K~8K in data 2 should be recovered using data 1 and parity. Currently, if we trigger a read on 0~4K of data 1, we will also recover 4K~8K of data 1 using the corrupted data 2 and parity, causing a wrong result in the rbio cache.

- Harder to expand for a future M-N scheme, as we're limited to just faila/b, i.e. two corruptions.

- Harder to expand to handle extra csum errors. This can be problematic if we start to do csum verification.
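As a rough sketch of the difference between the two models (the faila/failb fields and their comments are quoted from raid56.h; the set_bit() call mirrors the new code in the diff below):

	/* Old model: at most two failed stripe numbers per rbio. */
	s8 faila;	/* first bad stripe, -1 means no corruption */
	s8 failb;	/* second bad stripe (for RAID6 use) */

	/* New model: one bit per sector, so a single corrupted sector
	 * no longer taints its whole stripe. */
	set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
		rbio->error_bitmap);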
This patch will introduce an extra @error_bitmap, where one bit represents an error that happened in the corresponding sector. The choice to introduce a new error bitmap, rather than reusing sector_ptr, is to avoid an extra search between the rbio::stripe_sectors[] and rbio::bio_sectors[] arrays. Since we can submit bios using sectors from both arrays, doing a proper search on both arrays would be more complex. Although the new bitmap will take extra memory, later we can remove things like @error and faila/b to save some memory.

Currently the new error bitmap and the faila/b mechanism coexist; the error bitmap is only updated at endio time and at the recovery entrance.

Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/raid56.h | 11 +++++++ 2 files changed, 103 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 629237102189..e14968a37c08 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -76,6 +76,7 @@ static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { + bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); kfree(rbio->bio_sectors); kfree(rbio->stripe_sectors); @@ -950,9 +951,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || - !rbio->finish_pointers) { + !rbio->finish_pointers || !rbio->error_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); @@ -1044,8 +1046,11 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) + if (!stripe->dev->bdev) { + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); return fail_rbio_index(rbio, stripe_nr); + } /* see if we can add this page onto our existing bio */ if (last) { @@ -1209,6 +1214,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, * write. */ atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); rbio->faila = -1; rbio->failb = -1; @@ -1332,6 +1338,40 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, return -1; } +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + int total_nr_sector = offset >> fs_info->sectorsize_bits; + + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); + + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); + + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead. 
+ */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } + } + ASSERT(found_missing); + } +} + /* * returns -EIO if we had too many failures */ @@ -1423,14 +1463,49 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct bio_vec *bv = bio_first_bvec_all(bio); + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector; + + sector = &rbio->stripe_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + sector = &rbio->bio_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; +} + +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) + bio_size += bvec->bv_len; + + bitmap_set(rbio->error_bitmap, total_sector_nr, + bio_size >> rbio->bioc->fs_info->sectorsize_bits); +} + static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) + if (bio->bi_status) { fail_bio_stripe(rbio, bio); - else + rbio_update_error_bitmap(rbio, bio); + } else { set_bio_pages_uptodate(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -1863,10 +1938,10 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->error); /* Skip the current stripe. */ ASSERT(sectornr == 0); total_sector_nr += rbio->stripe_nsectors - 1; + atomic_inc(&rbio->error); continue; } sector = rbio_stripe_sector(rbio, stripe, sectornr); @@ -1891,9 +1966,10 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) /* * Either we're doing recover for a read failure or degraded write, - * caller should have set faila/b correctly. + * caller should have set faila/b and error bitmap correctly. 
 */ ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); /* @@ -1978,6 +2054,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->operation = BTRFS_RBIO_READ_REBUILD; rbio_add_bio(rbio, bio); + set_rbio_range_error(rbio, bio); + rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn(fs_info, @@ -2038,8 +2116,10 @@ static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; - if (err) + if (err) { fail_bio_stripe(rbio, bio); + rbio_update_error_bitmap(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); @@ -2117,6 +2197,7 @@ write: spin_unlock_irq(&rbio->bio_list_lock); atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); @@ -2328,6 +2409,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) } atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); @@ -2533,6 +2615,8 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + ret = scrub_assemble_read_bios(rbio, &bio_list); if (ret < 0) goto cleanup; @@ -2612,6 +2696,7 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) */ ASSERT(!bio->bi_iter.bi_size); + set_rbio_range_error(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn_rl(fs_info, diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 9fae97b7a2a5..e38da4fa76d6 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -126,6 +126,17 @@ struct btrfs_raid_bio { /* Allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; + + /* + * The bitmap recording where IO errors happened. + * Each bit is corresponding to one sector in either bio_sectors[] or + * stripe_sectors[] array. + * + * The reason we don't use another bit in sector_ptr is, we have two + * arrays of sectors, and a lot of IO can use sectors in both arrays. + * Thus making it much harder to iterate. + */ + unsigned long *error_bitmap; }; /* -- cgit
From 75b47033296595efb208cc563cbb8cf4fb7c3ebc Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 7 Nov 2022 15:32:30 +0800
Subject: btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap

Since we have rbio::error_bitmap to indicate exactly where the errors are (including read errors and csum mismatch errors), we can make the recovery path more accurate. For example:

           0        32K       64K
    Data 1 |XXXXXXXX|         |
    Data 2 |        |XXXXXXXXX|
    Parity |        |         |

1) Get a csum mismatch when reading data 1 [0, 32K).

2) Mark the corresponding range as in error. The old code will mark the whole data 1 stripe as in error, while the new code will only mark data 1 [0, 32K) as in error.

3) Recovery path. The old code will recover data 1 [0, 64K), all using data 2 and parity. This means data 1 [32K, 64K) will be corrupted data, as data 2 [32K, 64K) is already corrupted. The new code will only recover data 1 [0, 32K), as only that range has an error so far.

This new behavior can avoid populating the rbio cache with incorrect data.
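As a rough sketch, the per-vertical-stripe decision enabled by this change looks like the following (helper and field names are taken from the diff below; the loop is condensed for illustration):

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (!found_errors)
			continue;	/* this vertical stripe is clean */
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;	/* beyond what the parity can rebuild */
		/* otherwise rebuild only this vertical stripe */
	}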
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 279 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 193 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e14968a37c08..fd67ba4648c9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1013,6 +1013,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) return 0; } +/* + * Return the total numer of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. + */ +static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) +{ + int stripe_nr; + int found_errors = 0; + + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; + + if (test_bit(total_sector_nr, rbio->error_bitmap)) { + found_errors++; + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } + } + return found_errors; +} + /* * Add a single sector @sector into our list of bios for IO. * @@ -1740,14 +1770,15 @@ fail: * @*pointers are the pre-allocated pointers by the caller, so we don't * need to allocate/free the pointers again and again. */ -static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - void **pointers, void **unmap_array) +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; const u32 sectorsize = fs_info->sectorsize; - const int faila = rbio->faila; - const int failb = rbio->failb; + int found_errors; + int faila; + int failb; int stripe_nr; /* @@ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && !test_bit(sector_nr, &rbio->dbitmap)) - return; + return 0; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the veritical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) + return 0; + + if (found_errors > rbio->bioc->max_errors) + return -EIO; /* * Setup our array of pointers with sectors from each stripe @@ -1766,12 +1809,11 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* - * If we're rebuilding a read, we have to use - * pages from the bio list + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe_nr == faila || stripe_nr == failb)) { + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1859,18 +1901,19 @@ pstripe: * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate. 
*/ - if (rbio->faila >= 0) { - sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); + if (faila >= 0) { + sector = rbio_stripe_sector(rbio, faila, sector_nr); sector->uptodate = 1; } - if (rbio->failb >= 0) { - sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); + if (failb >= 0) { + sector = rbio_stripe_sector(rbio, failb, sector_nr); sector->uptodate = 1; } cleanup: for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) kunmap_local(unmap_array[stripe_nr]); + return 0; } static int recover_sectors(struct btrfs_raid_bio *rbio) @@ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) goto out; } - /* Make sure faila and fail b are in order. */ - if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb) - swap(rbio->faila, rbio->failb); - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { spin_lock_irq(&rbio->bio_list_lock); @@ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - recover_vertical(rbio, sectornr, pointers, unmap_array); + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + ret = recover_vertical(rbio, sectornr, pointers, unmap_array); + if (ret < 0) + break; + } out: kfree(pointers); @@ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector; - if (rbio->faila == stripe || rbio->failb == stripe) { - /* Skip the current stripe. */ - ASSERT(sectornr == 0); - total_sector_nr += rbio->stripe_nsectors - 1; - atomic_inc(&rbio->error); + /* + * Skip the range which has error. It can be a range which is + * marked error (for csum mismatch), or it can be a missing + * device. + */ + if (!rbio->bioc->stripes[stripe].dev->bdev || + test_bit(total_sector_nr, rbio->error_bitmap)) { + /* + * Also set the error bit for missing device, which + * may not yet have its error bit set. + */ + set_bit(total_sector_nr, rbio->error_bitmap); continue; } + sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); @@ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) /* * Either we're doing recover for a read failure or degraded write, - * caller should have set faila/b and error bitmap correctly. + * caller should have set error bitmap correctly. */ - ASSERT(rbio->faila >= 0 || rbio->failb >= 0); ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); @@ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - /* We have more errors than our tolerance during the read. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - ret = -EIO; - goto out; - } - ret = recover_sectors(rbio); out: @@ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work) rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } +static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +{ + bool found = false; + int sector_nr; + + /* + * This is for RAID6 extra recovery tries, thus mirror number should + * be large than 2. + * Mirror 1 means read from data stripes. Mirror 2 means rebuild using + * RAID5 methods. 
+ */ + ASSERT(mirror_num > 2); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + int faila; + int failb; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + /* This vertical stripe doesn't have errors. */ + if (!found_errors) + continue; + + /* + * If we found errors, there should be only one error marked + * by previous set_rbio_range_error(). + */ + ASSERT(found_errors == 1); + found = true; + + /* Now select another stripe to mark as error. */ + failb = rbio->real_stripes - (mirror_num - 1); + if (failb <= faila) + failb--; + + /* Set the extra bit in error bitmap. */ + if (failb >= 0) + set_bit(failb * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + } + + /* We should found at least one vertical stripe with error.*/ + ASSERT(found); +} + /* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, @@ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, * for 'mirror_num > 2', select a stripe to fail on every retry. */ if (mirror_num > 2) { - /* - * 'mirror == 3' is to fail the p stripe and - * reconstruct from the q stripe. 'mirror > 3' is to - * fail a data stripe and reconstruct from p+q stripe. - */ + set_rbio_raid6_extra_error(rbio, mirror_num); rbio->failb = rbio->real_stripes - (mirror_num - 1); ASSERT(rbio->failb > 0); if (rbio->failb <= rbio->faila) @@ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - int dfail = 0, failp = -1; + void **pointers = NULL; + void **unmap_array = NULL; + int sector_nr; int ret; - /* No error case should be already handled by the caller. */ - ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + /* + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. + */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int dfail = 0, failp = -1; + int faila; + int failb; + int found_errors; - if (is_data_stripe(rbio, rbio->failb)) - dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + if (found_errors == 0) + continue; - /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) - */ - if (dfail > rbio->bioc->max_errors - 1) - return -EIO; + /* We should have at least one error here. */ + ASSERT(faila >= 0 || failb >= 0); - /* - * If all data is good, only parity is correctly, just - * repair the parity. 
- */ - if (dfail == 0) - return 0; + if (is_data_stripe(rbio, faila)) + dfail++; + else if (is_parity_stripe(faila)) + failp = faila; - /* - * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. - */ - if (failp != rbio->scrubp) - return -EIO; + if (is_data_stripe(rbio, failb)) + dfail++; + else if (is_parity_stripe(failb)) + failp = failb; + /* + * Because we can not use a scrubbing parity to repair the + * data, so the capability of the repair is declined. (In the + * case of RAID5, we can not repair anything.) + */ + if (dfail > rbio->bioc->max_errors - 1) { + ret = -EIO; + goto out; + } + /* + * If all data is good, only parity is correctly, just repair + * the parity, no need to recover data stripes. + */ + if (dfail == 0) + continue; - /* We have some corrupted sectors, need to repair them. */ - ret = recover_sectors(rbio); + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity is + * scrubbing parity, luckily, use the other one to repair the + * data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) { + ret = -EIO; + goto out; + } + + ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); + if (ret < 0) + goto out; + } +out: + kfree(pointers); + kfree(unmap_array); return ret; } @@ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - ret = -EIO; - goto cleanup; - } - /* - * No error during read, can finish the scrub and need to verify the - * P/Q sectors; - */ - if (atomic_read(&rbio->error) == 0) { - need_check = true; - goto finish; - } - - /* We have some failures, need to recover the failed sectors first. */ + /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) goto cleanup; -finish: /* * We have every sector properly prepared. Can finish the scrub * and writeback the good content. -- cgit From ad3daf1c3f5bcbba49a302d8d0e46467556bf6f3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 7 Nov 2022 15:32:31 +0800 Subject: btrfs: raid56: remove the old error tracking system Since all the recovery paths have been migrated to the new error bitmap based system, we can remove the old stripe number based system. This cleanup involves one behavior change: - Rebuild rbio can no longer be merged Previously a rebuild rbio (caused by retry after data csum mismatch) can be merged, if the error happens in the same stripe. But with the new error bitmap based solution, it's much harder to compare error bitmaps. So here we just don't merge rebuild rbio at all. This may introduce some performance impact at extreme corner cases, but we're willing to take it. Other than that, this patch will cleanup the following members: - rbio::faila - rbio::failb They will be replaced by per-vertical stripe check, which is more accurate. - rbio::error It will be replace by per-vertical stripe error bitmap check. - Allow get_rbio_vertical_errors() to accept NULL pointers for @faila and @failb Some call sites only want to check if we have errors beyond the tolerance. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 234 +++++++++++++----------------------------------------- fs/btrfs/raid56.h | 8 -- 2 files changed, 53 insertions(+), 189 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fd67ba4648c9..11be5d0a7eab 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -66,8 +66,6 @@ struct sector_ptr { static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); @@ -588,28 +586,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + last->operation == BTRFS_RBIO_READ_REBUILD) return 0; - if (last->operation == BTRFS_RBIO_READ_REBUILD) { - int fa = last->faila; - int fb = last->failb; - int cur_fa = cur->faila; - int cur_fb = cur->failb; - - if (last->faila >= last->failb) { - fa = last->failb; - fb = last->faila; - } - - if (cur->faila >= cur->failb) { - cur_fa = cur->failb; - cur_fb = cur->faila; - } - - if (fa != cur_fa || fb != cur_fb) - return 0; - } return 1; } @@ -973,10 +953,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; - rbio->faila = -1; - rbio->failb = -1; refcount_set(&rbio->refs, 1); - atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); @@ -1025,19 +1002,28 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, int stripe_nr; int found_errors = 0; - ASSERT(faila && failb); - *faila = -1; - *failb = -1; + if (faila || failb) { + /* + * Both @faila and @failb should be valid pointers if any of + * them is specified. + */ + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + } for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; if (test_bit(total_sector_nr, rbio->error_bitmap)) { found_errors++; - if (*faila < 0) - *faila = stripe_nr; - else if (*failb < 0) - *failb = stripe_nr; + if (faila) { + /* Update faila and failb. */ + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } } } return found_errors; @@ -1077,9 +1063,17 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) { + int found_errors; + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, rbio->error_bitmap); - return fail_rbio_index(rbio, stripe_nr); + + /* Check if we have reached tolerance early. */ + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + NULL, NULL); + if (found_errors > rbio->bioc->max_errors) + return -EIO; + return 0; } /* see if we can add this page onto our existing bio */ @@ -1243,10 +1237,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, * Reset errors, as we may have errors inherited from from degraded * write. 
*/ - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - rbio->faila = -1; - rbio->failb = -1; /* * Start assembly. Make bios for everything from the higher layers (the @@ -1324,50 +1315,6 @@ error: return -EIO; } -/* - * helper to find the stripe number for a given bio. Used to figure out which - * stripe has failed. This expects the bio to correspond to a physical disk, - * so it looks up based on physical sector numbers. - */ -static int find_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 physical = bio->bi_iter.bi_sector; - int i; - struct btrfs_io_stripe *stripe; - - physical <<= 9; - - for (i = 0; i < rbio->bioc->num_stripes; i++) { - stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && - stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { - return i; - } - } - return -1; -} - -/* - * helper to find the stripe number for a given - * bio (before mapping). Used to figure out which stripe has - * failed. This looks up based on logical block numbers. - */ -static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - int i; - - for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bioc->raid_map[i]; - - if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) - return i; - } - return -1; -} - static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; @@ -1402,52 +1349,6 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } } -/* - * returns -EIO if we had too many failures - */ -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&rbio->bio_list_lock, flags); - - /* we already know this stripe is bad, move on */ - if (rbio->faila == failed || rbio->failb == failed) - goto out; - - if (rbio->faila == -1) { - /* first failure on this rbio */ - rbio->faila = failed; - atomic_inc(&rbio->error); - } else if (rbio->failb == -1) { - /* second failure on this rbio */ - rbio->failb = failed; - atomic_inc(&rbio->error); - } else { - ret = -EIO; - } -out: - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); - - return ret; -} - -/* - * helper to fail a stripe based on a physical disk - * bio. - */ -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - int failed = find_bio_stripe(rbio, bio); - - if (failed < 0) - return -EIO; - - return fail_rbio_index(rbio, failed); -} - /* * For subpage case, we can no longer set page Uptodate directly for * stripe_pages[], thus we need to locate the sector. @@ -1530,12 +1431,10 @@ static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) { - fail_bio_stripe(rbio, bio); + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); - } else { + else set_bio_pages_uptodate(rbio, bio); - } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2021,12 +1920,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); - /* - * Reset error to 0, as we will later increase error for missing - * devices. - */ - atomic_set(&rbio->error, 0); - /* For recovery, we need to read all sectors including P/Q. 
*/ ret = alloc_rbio_pages(rbio); if (ret < 0) @@ -2144,30 +2037,13 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, set_rbio_range_error(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn(fs_info, -"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", - __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bioc->map_type); - free_raid_bio(rbio); - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return; - } - /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num > 2) { + if (mirror_num > 2) set_rbio_raid6_extra_error(rbio, mirror_num); - rbio->failb = rbio->real_stripes - (mirror_num - 1); - ASSERT(rbio->failb > 0); - if (rbio->failb <= rbio->faila) - rbio->failb--; - } start_async_work(rbio, recover_rbio_work); } @@ -2179,7 +2055,6 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) int ret; bio_list_init(&bio_list); - atomic_set(&rbio->error, 0); ret = rmw_assemble_read_bios(rbio, &bio_list); if (ret < 0) @@ -2200,10 +2075,8 @@ static void raid_wait_write_end_io(struct bio *bio) struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; - if (err) { - fail_bio_stripe(rbio, bio); + if (err) rbio_update_error_bitmap(rbio, bio); - } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); @@ -2253,19 +2126,14 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) if (ret < 0) return ret; - atomic_set(&rbio->error, 0); index_rbio_pages(rbio); ret = rmw_read_and_wait(rbio); if (ret < 0) return ret; - /* Too many read errors, beyond our tolerance. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - return ret; - - /* Have read failures but under tolerance, needs recovery. */ - if (rbio->faila >= 0 || rbio->failb >= 0) { + /* We have read errors, try recovery path. */ + if (!bitmap_empty(rbio->error_bitmap, rbio->nr_sectors)) { ret = recover_rbio(rbio); if (ret < 0) return ret; @@ -2280,7 +2148,6 @@ write: set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); @@ -2309,9 +2176,16 @@ write: submit_write_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - /* We have more errors than our tolerance during the read. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - ret = -EIO; + /* We may have more errors than our tolerance during the read. 
 */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } return ret; } @@ -2492,7 +2366,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ @@ -2726,6 +2599,7 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; struct bio_list bio_list; + int sector_nr; int ret; struct bio *bio; @@ -2735,7 +2609,6 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) if (ret) goto cleanup; - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); ret = scrub_assemble_read_bios(rbio, &bio_list); @@ -2756,8 +2629,15 @@ */ ret = finish_parity_scrub(rbio, need_check); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - ret = -EIO; + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } return ret; cleanup: @@ -2804,14 +2684,6 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) ASSERT(!bio->bi_iter.bi_size); set_rbio_range_error(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn_rl(fs_info, - "can not determine the failed stripe number for full stripe %llu", - bioc->raid_map[0]); - free_raid_bio(rbio); - return NULL; - } return rbio; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index e38da4fa76d6..a2e653e93fd8 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -74,12 +74,6 @@ struct btrfs_raid_bio { /* How many sectors there are for each stripe */ u8 stripe_nsectors; - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - /* Stripe number that we're scrubbing */ u8 scrubp; @@ -93,8 +87,6 @@ struct btrfs_raid_bio { atomic_t stripes_pending; - atomic_t error; - wait_queue_head_t io_wait; /* Bitmap to record which horizontal stripe has data */ -- cgit
From 3e09b5b2293f21e6e28929b6bbb73833678bfdd1 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 7 Nov 2022 17:30:21 +0100
Subject: btrfs: constify input buffer parameter in compression code

The input buffers passed down to compression must never be changed, so switch the type to u8, as it's a raw byte buffer, and make it const.
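As a hypothetical illustration (the snippet below is not part of the patch), the const qualifier turns an accidental modification of the compressed input into a compile error:

	int zlib_decompress(struct list_head *ws, const u8 *data_in,
			    struct page *dest_page, unsigned long start_byte,
			    size_t srclen, size_t destlen);

	/* Inside the decompressor, a stray write such as
	 *	data_in[0] = 0;
	 * now fails to compile instead of silently corrupting the
	 * compressed input. */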
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/compression.h | 8 ++++---- fs/btrfs/lzo.c | 2 +- fs/btrfs/zlib.c | 2 +- fs/btrfs/zstd.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 42e6dde2ad59..30adf430241e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -119,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - unsigned char *data_in, struct page *dest_page, + const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { switch (type) { @@ -1232,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { struct list_head *workspace; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b961462399ae..6209d40a1e08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -86,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); @@ -150,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); @@ -161,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); @@ -171,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 
e7b1ceffcd33..d5e78cbc8fbc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -427,7 +427,7 @@ out: return ret; } -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c5275cb23837..01a13de11832 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -355,7 +355,7 @@ done: return ret; } -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 4575b3703e74..e34f1ab99d56 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -616,7 +616,7 @@ done: return ret; } -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { -- cgit
From bb21e30260a672172a26ee1626dc1463215cf18c Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Mon, 7 Nov 2022 23:07:17 +0800
Subject: btrfs: move device->name RCU allocation and assign to btrfs_alloc_device()

There is a repeating code section in the parent functions after calling btrfs_alloc_device(), as below:

	name = rcu_string_strdup(path, GFP_...);
	if (!name) {
		btrfs_free_device(device);
		return ERR_PTR(-ENOMEM);
	}
	rcu_assign_pointer(device->name, name);

Except in add_missing_dev(), for obvious reasons. This patch consolidates that repeating code into btrfs_alloc_device() itself so that the parent functions don't have to duplicate code. This consolidation also helps with reviewing issues regarding RCU lock violations with device->name.

The parent functions device_list_add() and add_missing_dev() use GFP_NOFS for the allocation, whereas the rest of the parent functions use GFP_KERNEL, so establish the NOFS allocation context using memalloc_nofs_save() in device_list_add(); add_missing_dev() is already doing it.
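After the change, the repeating section collapses into the single btrfs_alloc_device() call; callers that must avoid recursing into filesystem reclaim wrap it in a NOFS context, as device_list_add() does in the diff below (condensed sketch, surrounding declarations omitted):

	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid,
				    disk_super->dev_item.uuid, path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(device))
		return device;	/* the name strdup failure is handled inside */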
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 10 +------- fs/btrfs/volumes.c | 68 +++++++++++++++++++++++--------------------------- fs/btrfs/volumes.h | 4 +-- 3 files changed, 34 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 84af2010fae2..9c4a8649a0f4 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -249,7 +249,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; - struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -293,19 +292,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - device = btrfs_alloc_device(NULL, &devid, NULL); + device = btrfs_alloc_device(NULL, &devid, NULL, device_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index acb97494bb52..49d6e79d4343 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -845,26 +845,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, } if (!device) { + unsigned int nofs_flag; + if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, - disk_super->dev_item.uuid); + disk_super->dev_item.uuid, path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) { mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } - name = rcu_string_strdup(path, GFP_NOFS); - if (!name) { - btrfs_free_device(device); - mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(-ENOMEM); - } - rcu_assign_pointer(device->name, name); device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); @@ -997,30 +994,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { - struct rcu_string *name; + const char *dev_path = NULL; + + /* + * This is ok to do without RCU read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) + dev_path = orig_dev->name->str; device = btrfs_alloc_device(NULL, &orig_dev->devid, - orig_dev->uuid); + orig_dev->uuid, dev_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - /* - * This is ok to do without rcu read locked because we hold the - * uuid mutex so nothing we touch in here is going to disappear. 
- */ - if (orig_dev->name) { - name = rcu_string_strdup(orig_dev->name->str, - GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); - } - if (orig_dev->zone_info) { struct btrfs_zoned_device_info *zone_info; @@ -2604,7 +2593,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_device *device; struct block_device *bdev; struct super_block *sb = fs_info->sb; - struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; @@ -2645,20 +2633,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_read_unlock(); - device = btrfs_alloc_device(fs_info, NULL, NULL); + device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - ret = -ENOMEM; - goto error_free_device; - } - rcu_assign_pointer(device->name, name); - device->fs_info = fs_info; device->bdev = bdev; ret = lookup_bdev(device_path, &device->devt); @@ -6998,8 +6979,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * always do NOFS because we use it in a lot of other GFP_KERNEL safe * places. */ + nofs_flag = memalloc_nofs_save(); - device = btrfs_alloc_device(NULL, &devid, dev_uuid); + device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; @@ -7023,14 +7005,15 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * is generated. * @uuid: a pointer to UUID for this device. If NULL a new UUID * is generated. + * @path: a pointer to device path if available, NULL otherwise. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be * destroyed with btrfs_free_device. 
 */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid) + const u64 *devid, const u8 *uuid, + const char *path) { struct btrfs_device *dev; u64 tmp; @@ -7068,6 +7051,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); + if (path) { + struct rcu_string *name; + + name = rcu_string_strdup(path, GFP_KERNEL); + if (!name) { + btrfs_free_device(dev); + return ERR_PTR(-ENOMEM); + } + rcu_assign_pointer(dev->name, name); + } + return dev; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b05124e62412..f2a152937cd4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -649,8 +649,8 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid); + const u64 *devid, const u8 *uuid, + const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, -- cgit
From 789d6a3a876e32c23fc9633d5b372d02a5188f0e Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 14 Sep 2022 13:32:50 +0800
Subject: btrfs: concentrate all tree block parentness check parameters into one structure

There are several different tree block parentness check parameters used across several helpers:

- level Mandatory.

- transid In most cases it's mandatory, but there are several backref cases which skip this check.

- owner_root - first_key Utilized by most top-down tree search routines; otherwise they can be skipped.

Those four members are not always mandatory checks, and some of them share the same u64 type, which means that if some arguments get swapped the compiler will not catch it. Furthermore, if we're going to further expand the parentness check, we would need to modify quite a few helpers just to add one more parameter.
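To make the hazard concrete, here is a hypothetical mix-up (not from the patch) that the old read_tree_block() prototype accepts without any compiler diagnostic:

	/* Old prototype: owner_root and parent_transid are both u64. */
	eb = read_tree_block(fs_info, bytenr,
			     gen,	/* oops: transid passed as owner_root */
			     btrfs_header_owner(parent), /* and vice versa */
			     level - 1, &first_key);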
This patch will concentrate all these members into a structure called btrfs_tree_parent_check, and pass that structure for the following helpers: - btrfs_read_extent_buffer() - read_tree_block() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/backref.c | 15 ++++++++---- fs/btrfs/ctree.c | 28 +++++++++++++--------- fs/btrfs/disk-io.c | 63 ++++++++++++++++++++++++++++--------------------- fs/btrfs/disk-io.h | 36 ++++++++++++++++++++++++---- fs/btrfs/extent-tree.c | 12 ++++++---- fs/btrfs/print-tree.c | 14 ++++++----- fs/btrfs/qgroup.c | 18 ++++++++++---- fs/btrfs/relocation.c | 11 ++++++--- fs/btrfs/tree-log.c | 25 ++++++++++++++------ fs/btrfs/tree-mod-log.c | 9 +++++-- 10 files changed, 159 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 430974cf3b96..55c072ba6747 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -840,6 +840,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct rb_node *node; while ((node = rb_first_cached(&tree->root))) { + struct btrfs_tree_parent_check check = { 0 }; + ref = rb_entry(node, struct prelim_ref, rbnode); rb_erase_cached(node, &tree->root); @@ -847,8 +849,10 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, - ref->root_id, 0, ref->level - 1, NULL); + check.level = ref->level - 1; + check.owner_root = ref->root_id; + + eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -1591,10 +1595,13 @@ again: if (ref->count && ref->parent) { if (!ctx->ignore_extent_item_pos && !ref->inode_list && ref->level == 0) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(ctx->fs_info, ref->parent, 0, - 0, ref->level, NULL); + check.level = ref->level; + + eb = read_tree_block(ctx->fs_info, ref->parent, + &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0acd85111cdf..f75e398d7b71 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -857,19 +857,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - struct btrfs_key first_key; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0); - btrfs_node_key_to_cpu(parent, &first_key, slot); + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); + check.owner_root = btrfs_header_owner(parent); + check.has_first_key = true; + btrfs_node_key_to_cpu(parent, &check.first_key, slot); + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), - btrfs_header_owner(parent), - btrfs_node_ptr_generation(parent, slot), - level - 1, &first_key); + &check); if (IS_ERR(eb)) return eb; if (!extent_buffer_uptodate(eb)) { @@ -1428,10 +1431,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; struct extent_buffer *tmp; - struct btrfs_key first_key; int ret; int parent_level; bool unlock_up; @@ -1440,7 +1443,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = 
btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); - btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); + check.has_first_key = true; + check.level = parent_level - 1; + check.transid = gen; + check.owner_root = root->root_key.objectid; /* * If we need to read an extent buffer from disk and we are holding locks @@ -1462,7 +1469,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * parents (shared tree blocks). */ if (btrfs_verify_level_key(tmp, - parent_level - 1, &first_key, gen)) { + parent_level - 1, &check.first_key, gen)) { free_extent_buffer(tmp); return -EUCLEAN; } @@ -1479,7 +1486,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_unlock_up_safe(p, level + 1); /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + ret = btrfs_read_extent_buffer(tmp, &check); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); @@ -1509,8 +1516,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, - gen, parent_level - 1, &first_key); + tmp = read_tree_block(fs_info, blocknr, &check); if (IS_ERR(tmp)) { btrfs_release_path(p); return PTR_ERR(tmp); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e602e0b4c25d..2f944a7c70d5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -259,13 +259,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * - * @parent_transid: expected transid, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key of first slot, skip check if NULL + * @check: expected tree parentness check, see the comments of the + * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -275,16 +273,19 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, int mirror_num = 0; int failed_mirror = 0; + ASSERT(check); + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); if (!ret) { - if (verify_parent_transid(io_tree, eb, - parent_transid, 0)) + if (verify_parent_transid(io_tree, eb, check->transid, 0)) ret = -EIO; - else if (btrfs_verify_level_key(eb, level, - first_key, parent_transid)) + else if (btrfs_verify_level_key(eb, check->level, + check->has_first_key ? + &check->first_key : NULL, + check->transid)) ret = -EUCLEAN; else break; @@ -936,28 +937,28 @@ struct extent_buffer *btrfs_find_create_tree_block( * Read tree block at logical address @bytenr and do variant basic but critical * verification. * - * @owner_root: the objectid of the root owner for this block. - * @parent_transid: expected transid of this tree block, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key in slot 0, skip check if NULL + * @check: expected tree parentness check, see comments of the + * structure for details. 
*/ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + ASSERT(check); + + buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, + check->level); if (IS_ERR(buf)) return buf; - ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); + ret = btrfs_read_extent_buffer(buf, check); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, owner_root)) { + if (btrfs_check_eb_owner(buf, check->owner_root)) { free_extent_buffer_stale(buf); return ERR_PTR(-EUCLEAN); } @@ -1373,6 +1374,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_key *key) { struct btrfs_root *root; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_fs_info *fs_info = tree_root->fs_info; u64 generation; int ret; @@ -1392,9 +1394,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); - root->node = read_tree_block(fs_info, - btrfs_root_bytenr(&root->root_item), - key->objectid, generation, level, NULL); + check.level = level; + check.transid = generation; + check.owner_root = key->objectid; + root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), + &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -2367,6 +2371,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); @@ -2382,10 +2387,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (!log_tree_root) return -ENOMEM; - log_tree_root->node = read_tree_block(fs_info, bytenr, - BTRFS_TREE_LOG_OBJECTID, - fs_info->generation + 1, level, - NULL); + check.level = level; + check.transid = fs_info->generation + 1; + check.owner_root = BTRFS_TREE_LOG_OBJECTID; + log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2863,10 +2868,14 @@ out: static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen, + .owner_root = root->root_key.objectid + }; int ret = 0; - root->node = read_tree_block(root->fs_info, bytenr, - root->root_key.objectid, gen, level, NULL); + root->node = read_tree_block(root->fs_info, bytenr, &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2c507fd0e04..03fe4154ffb8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,35 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } +/* All the extra info needed to verify the parentness of a tree block. */ +struct btrfs_tree_parent_check { + /* + * The owner check against the tree block. + * + * Can be 0 to skip the owner check. 
+ */ + u64 owner_root; + + /* + * Expected transid, can be 0 to skip the check, but such skip + * should only be utilized for backref walk related code. + */ + u64 transid; + + /* + * The expected first key. + * + * This check can be skipped if @has_first_key is false, such skip + * can happen for cases where we don't have the parent node key, + * e.g. reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; + struct btrfs_device; struct btrfs_fs_devices; @@ -33,8 +62,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key); + struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, @@ -111,8 +139,8 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, - int level, struct btrfs_key *first_key); +int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); enum btrfs_wq_submit_cmd { WQ_SUBMIT_METADATA, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b037107678c8..10cc757a602b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5261,8 +5261,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; - struct btrfs_key first_key; struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; @@ -5284,7 +5284,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, + + check.level = level - 1; + check.transid = generation; + check.owner_root = root->root_key.objectid; + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); next = find_extent_buffer(fs_info, bytenr); @@ -5346,8 +5351,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, root->root_key.objectid, - generation, level - 1, &first_key); + next = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1a2350fd68be..1469aa55ad48 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -386,14 +386,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) if (!follow) return; for (i = 0; i < nr; i++) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { + .level = level - 1, + .transid = btrfs_node_ptr_generation(c, i), + .owner_root = btrfs_header_owner(c), + .has_first_key = true + }; struct extent_buffer *next; - btrfs_node_key_to_cpu(c, &first_key, i); - next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), -
btrfs_header_owner(c), - btrfs_node_ptr_generation(c, i), - level - 1, &first_key); + btrfs_node_key_to_cpu(c, &check.first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); if (IS_ERR(next)) continue; if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 24c013c61a94..e0522c6c0d67 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2339,7 +2339,13 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, } if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); + struct btrfs_tree_parent_check check = { + .has_first_key = false, + .transid = root_gen, + .level = root_level + }; + + ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) goto out; } @@ -4303,6 +4309,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct extent_buffer *subvol_eb) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; @@ -4351,10 +4358,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, blocks->swapped = swapped; spin_unlock(&blocks->lock); + check.level = block->level; + check.transid = block->reloc_generation; + check.has_first_key = true; + memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); + /* Read out reloc subtree root */ - reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, - block->reloc_generation, block->level, - &block->first_key); + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); if (IS_ERR(reloc_eb)) { ret = PTR_ERR(reloc_eb); reloc_eb = NULL; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8914aa920bb7..56c8afa6f6d2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2610,10 +2610,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { + struct btrfs_tree_parent_check check = { + .level = block->level, + .owner_root = block->owner, + .transid = block->key.offset + }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, block->owner, - block->key.offset, block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { @@ -3426,9 +3430,10 @@ int add_data_references(struct reloc_control *rc, ULIST_ITER_INIT(&leaf_uiter); while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(ctx.fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7b1bb9c63e4..4d9f6803bfbe 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -341,7 +341,12 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. 
*/ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen + }; + + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; } @@ -2411,13 +2416,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; + struct btrfs_tree_parent_check check = { + .transid = gen, + .level = level + }; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; int i; int ret; - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; @@ -2597,7 +2606,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, int ret = 0; while (*level > 0) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { 0 }; cur = path->nodes[*level]; @@ -2609,7 +2618,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); + check.transid = ptr_gen; + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, @@ -2628,8 +2640,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_extent_buffer(next, ptr_gen, - *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; } @@ -2657,7 +2668,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3bea19698a10..779ad44d285f 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -821,10 +821,15 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + struct btrfs_tree_parent_check check = { 0 }; + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); + + check.level = level; + check.owner_root = root->root_key.objectid; + + old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); -- cgit From 947a629988f191807d2d22ba63ae18259bb645c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Sep 2022 13:32:51 +0800 Subject: btrfs: move tree block parentness check into validate_extent_buffer() [BACKGROUND] Although both btrfs metadata and data have their read time verification done at endio time (btrfs_validate_metadata_buffer() and btrfs_verify_data_csum()), metadata has extra verification, mostly parentness checks including first key/transid/owner_root/level, done at read_tree_block() and btrfs_read_extent_buffer(). On the other hand, all the data verification is done at endio context.
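As a rough sketch of that read-side flow, here is a condensation of the pre-patch btrfs_read_extent_buffer() (the code this patch removes, see the hunk below; locals, the mirror-retry loop and error paths are omitted, so this is not a literal kernel excerpt):

    /* Pre-patch: endio only verifies checksums, the parentness
     * checks run afterwards in btrfs_read_extent_buffer(). */
    ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
    if (!ret) {
            if (verify_parent_transid(io_tree, eb, check->transid, 0))
                    ret = -EIO;      /* transid mismatch */
            else if (btrfs_verify_level_key(eb, check->level,
                                            check->has_first_key ?
                                            &check->first_key : NULL,
                                            check->transid))
                    ret = -EUCLEAN;  /* level or first key mismatch */
    }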
[ENHANCEMENT] This patch will make a new union in btrfs_bio, taking the space of the old data checksums, thus it will not increase the memory usage. With that extra btrfs_tree_parent_check inside btrfs_bio, we can just pass the check parameter into read_extent_buffer_pages(), and before submitting the bio, we can copy the check structure into btrfs_bio. And finally at endio time, we can grab btrfs_bio::parent_check and pass it to validate_extent_buffer(), to move the remaining checks into it. This brings the following benefits: - Much simpler btrfs_read_extent_buffer() Now it only needs to iterate through all mirrors. - Simpler read-time transid check Previously we called verify_parent_transid() after reading out the extent buffer. Now the transid check is done inside the endio function, where no other code can modify the content. Thus no need to use the extent lock anymore. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 73 ++++++++++++++++++++++++++++++++++++++-------------- fs/btrfs/extent_io.c | 18 ++++++++++--- fs/btrfs/extent_io.h | 5 ++-- fs/btrfs/volumes.h | 25 +++++++++++++++--- 4 files changed, 93 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f944a7c70d5..559e7f3d727e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -266,7 +266,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; int failed = 0; int ret; int num_copies = 0; @@ -275,21 +274,11 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); - if (!ret) { - if (verify_parent_transid(io_tree, eb, check->transid, 0))
- &check->first_key : NULL, - check->transid)) - ret = -EUCLEAN; - else - break; - } + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + if (!ret) + break; num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); @@ -465,7 +454,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb) +static int validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -475,6 +465,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u8 *header_csum; int ret = 0; + ASSERT(check); + found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, @@ -513,6 +505,45 @@ static int validate_extent_buffer(struct extent_buffer *eb) goto out; } + if (found_level != check->level) { + ret = -EIO; + goto out; + } + if (unlikely(check->transid && + btrfs_header_generation(eb) != check->transid)) { + btrfs_err_rl(eb->fs_info, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, check->transid, + btrfs_header_generation(eb)); + ret = -EIO; + goto out; + } + if (check->has_first_key) { + struct btrfs_key *expect_key = &check->first_key; + struct btrfs_key found_key; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, check->transid, + expect_key->objectid, + expect_key->type, expect_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + ret = -EUCLEAN; + goto out; + } + } + if (check->owner_root) { + ret = btrfs_check_eb_owner(eb, check->owner_root); + if (ret < 0) + goto out; + } + /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just @@ -537,13 +568,15 @@ out: } static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror) + int mirror, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct extent_buffer *eb; bool reads_done; int ret = 0; + ASSERT(check); + /* * We don't allow bio merge for subpage metadata read, so we should * only get one eb for each endio hook. 
@@ -567,7 +600,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, check); if (ret < 0) goto err; @@ -597,7 +630,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror); + return validate_subpage_buffer(page, start, end, mirror, + &bbio->parent_check); eb = (struct extent_buffer *)page->private; @@ -616,7 +650,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, &bbio->parent_check); err: if (ret) { /* @@ -774,6 +808,7 @@ void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int m blk_status_t ret; bio->bi_opf |= REQ_META; + bbio->is_metadata = 1; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { btrfs_submit_bio(fs_info, bio, mirror_num); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 859a41624c31..d257879edaba 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4934,7 +4934,8 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num) + int mirror_num, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -4947,6 +4948,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ASSERT(PagePrivate(page)); + ASSERT(check); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { @@ -4990,6 +4992,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) { free_extent_state(cached_state); @@ -5003,7 +5006,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return ret; } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *check) { int i; struct page *page; @@ -5029,7 +5033,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) return -EIO; if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num); + return read_extent_buffer_subpage(eb, wait, mirror_num, check); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -5106,6 +5110,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) @@ -5841,6 +5846,11 @@ int try_release_extent_buffer(struct page *page) void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .has_first_key = 0, + .level = level, + .transid = gen + }; struct extent_buffer *eb; int ret; @@ -5853,7 +5863,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); if (ret < 0) 
free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 805e262811b4..a0bafc7f6c07 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -66,6 +66,7 @@ struct btrfs_inode; struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; +struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); @@ -170,8 +171,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, - int mirror_num); +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *parent_check); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f2a152937cd4..efa6a3d48cd8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -11,6 +11,7 @@ #include #include "async-thread.h" #include "messages.h" +#include "disk-io.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -397,7 +398,15 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); * Mostly for btrfs specific features like csum and mirror_num. */ struct btrfs_bio { - unsigned int mirror_num; + unsigned int mirror_num:7; + + /* + * Extra indicator for metadata bios. + * Some btrfs bios use pages without a mapping, thus + * we cannot rely on page->mapping->host to determine if + * it's a metadata bio. + */ + unsigned int is_metadata:1; struct bvec_iter iter; /* for direct I/O */ @@ -405,8 +414,16 @@ struct btrfs_bio { /* @device is for stripe IO submission. */ struct btrfs_device *device; - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + union { + /* For data checksum verification. */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + }; + + /* For metadata parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; /* End I/O information supplied to btrfs_bio_alloc */ btrfs_bio_end_io_t end_io; @@ -443,6 +460,8 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) { + if (bbio->is_metadata) + return; if (bbio->csum != bbio->csum_inline) { kfree(bbio->csum); bbio->csum = NULL; -- cgit From 2c8f5e8cdf0f77670b1a9f72156ad4e82ed323d1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:27 +0000 Subject: btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree We don't need to set the EXTENT_UPTODATE bit in an inode's io_tree to mark a range as uptodate, we rely on the pages themselves being uptodate - page reading is not triggered for already uptodate pages. Recently we removed most use of the EXTENT_UPTODATE for buffered IO with commit 52b029f42751 ("btrfs: remove unnecessary EXTENT_UPTODATE state in buffered I/O path"), but there were a few leftovers, namely when reading from holes and successfully finishing read repair. These leftovers are unnecessarily making an inode's tree larger and deeper, slowing down searches on it. So remove all the leftovers. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file.
Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 7 ------- fs/btrfs/extent_io.c | 19 +++---------------- 2 files changed, 3 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index cdee8c0854c8..18ab82f62611 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -216,13 +216,6 @@ static inline int set_extent_new(struct extent_io_tree *tree, u64 start, return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } -static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, mask); -} - int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d257879edaba..33cc3a9a2b68 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -901,14 +901,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); const u32 sectorsize = inode->root->fs_info->sectorsize; - struct extent_state *cached = NULL; end_page_read(page, uptodate, offset, sectorsize); - if (uptodate) - set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_NOFS); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, - &cached); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); } static void submit_data_read_repair(struct inode *inode, @@ -1781,13 +1776,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - struct extent_state *cached = NULL; - iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); break; } @@ -1863,13 +1854,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - struct extent_state *cached = NULL; - memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 
1, - &cached, GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; -- cgit From 40daf3e095dbd1059d7e600d27e5bb987f563561 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:28 +0000 Subject: btrfs: add an early exit when searching for delalloc range for lseek/fiemap During fiemap and lseek (SEEK_HOLE/DATA), when looking for delalloc in a range corresponding to a hole or a prealloc extent, if we found the whole range marked as delalloc in the inode's io_tree, then we can terminate immediately and avoid searching the extent map tree. If not, and if the found delalloc starts at the same offset of our search start but ends before our search range's end, then we can adjust the search range for the search in the extent map tree. So implement those changes. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e6c93be91a06..9b1f76109682 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3216,7 +3216,7 @@ out: static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { - const u64 len = end + 1 - start; + u64 len = end + 1 - start; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 em_end; @@ -3242,13 +3242,23 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end delalloc_len = 0; } - /* - * If delalloc was found then *delalloc_start_ret has a sector size - * aligned value (rounded down). - */ - if (delalloc_len > 0) + if (delalloc_len > 0) { + /* + * If delalloc was found then *delalloc_start_ret has a sector size + * aligned value (rounded down). + */ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + if (*delalloc_start_ret == start) { + /* Delalloc for the whole range, nothing more to do. */ + if (*delalloc_end_ret == end) + return true; + /* Else trim our search range for extent maps. 
*/ + start = *delalloc_end_ret + 1; + len = end + 1 - start; + } + } + /* * No outstanding extents means we don't have any delalloc that is * flushing, so return the unflushed range found in the io tree (if any). -- cgit From af979fd618a40009c5cae03de21403848b13a931 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:29 +0000 Subject: btrfs: skip unnecessary delalloc searches during lseek/fiemap During lseek (SEEK_HOLE/DATA) and fiemap, when processing a file range that corresponds to a hole or a prealloc extent, if we find that there is no delalloc marked in the inode's io_tree but there is delalloc due to an extent map in the extent map tree, then on the next iteration that calls find_delalloc_subrange() we can skip searching the io tree again, since on the first call we had no delalloc in the io tree for the whole range. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9b1f76109682..99cc95487d42 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3214,6 +3214,7 @@ out: * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 len = end + 1 - start; @@ -3231,7 +3232,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end spin_lock(&inode->lock); outstanding_extents = inode->outstanding_extents; - if (inode->delalloc_bytes > 0) { + if (*search_io_tree && inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; delalloc_len = count_range_bits(&inode->io_tree, @@ -3257,6 +3258,9 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end start = *delalloc_end_ret + 1; len = end + 1 - start; } + } else { + /* No delalloc, future calls don't need to search again.
*/ + *search_io_tree = false; } /* @@ -3390,6 +3394,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); u64 prev_delalloc_end = 0; + bool search_io_tree = true; bool ret = false; while (cur_offset < end) { u64 delalloc_start; u64 delalloc_end; bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, + &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) -- cgit From 8ddc8274e4be9a35eafc5bef9ff64d94f452d193 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:30 +0000 Subject: btrfs: search for delalloc more efficiently during lseek/fiemap During lseek (SEEK_HOLE/DATA) and fiemap, when processing a file range that corresponds to a hole or a prealloc extent, we have to check if there's any delalloc in the range. We do it by searching for delalloc ranges in the inode's io_tree (for unflushed delalloc) and in the inode's extent map tree (for delalloc that is flushing). We avoid searching the extent map tree if the number of outstanding extents is 0, as in that case we can't have extent maps for our search range in the tree that correspond to delalloc that is flushing. However if we have any unflushed delalloc, due to buffered writes or mmap writes, then the outstanding extents counter is not 0 and we'll search the extent map tree. The tree may be large because it can have lots of extent maps that were loaded by reads or created by previous writes, therefore taking a significant time to search the tree, especially if we have a file with a lot of holes and/or prealloc extents. We can improve on this by searching the inode's ordered extents tree instead of the extent map tree, since when delalloc is flushing we create an ordered extent along with the new extent map, while holding the respective file range locked in the inode's io_tree. The ordered extents tree is typically much smaller, since ordered extents have a short life and get removed from the tree once they are completed, while extent maps can stay for a very long time in the extent map tree, either created by previous writes or loaded by read operations. So use the ordered extents tree instead of the extent maps tree. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option).
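For context, the access pattern that motivates this series looks roughly like the following userspace sketch, which walks a sparse file the way cp and tar do; this is illustrative only and not part of the patch:

    #define _GNU_SOURCE          /* for SEEK_DATA / SEEK_HOLE */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd;
            off_t end, data = 0, hole;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            end = lseek(fd, 0, SEEK_END);

            while (fd >= 0 && data < end) {
                    data = lseek(fd, data, SEEK_DATA);
                    if (data < 0)
                            break;  /* only a trailing hole remains */
                    hole = lseek(fd, data, SEEK_HOLE);
                    printf("data: [%lld, %lld)\n", (long long)data,
                           (long long)hole);
                    data = hole;    /* next iteration starts after the hole */
            }
            close(fd);
            return 0;
    }

Every SEEK_DATA/SEEK_HOLE call that lands on a hole or prealloc range ends up in find_delalloc_subrange(), which is what the changes below make cheaper.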
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 152 ++++++++++++++++++-------------------------------------- 1 file changed, 48 insertions(+), 104 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 99cc95487d42..6bc2397e324c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3218,29 +3218,27 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 len = end + 1 - start; - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - u64 em_end; - u64 delalloc_len; - unsigned int outstanding_extents; + u64 delalloc_len = 0; + struct btrfs_ordered_extent *oe; + u64 oe_start; + u64 oe_end; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet. */ - spin_lock(&inode->lock); - outstanding_extents = inode->outstanding_extents; - - if (*search_io_tree && inode->delalloc_bytes > 0) { - spin_unlock(&inode->lock); - *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, - delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); - } else { - spin_unlock(&inode->lock); - delalloc_len = 0; + if (*search_io_tree) { + spin_lock(&inode->lock); + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + } else { + spin_unlock(&inode->lock); + } } if (delalloc_len > 0) { @@ -3254,7 +3252,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end /* Delalloc for the whole range, nothing more to do. */ if (*delalloc_end_ret == end) return true; - /* Else trim our search range for extent maps. */ + /* Else trim our search range for ordered extents. */ start = *delalloc_end_ret + 1; len = end + 1 - start; } @@ -3264,110 +3262,56 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end } /* - * No outstanding extents means we don't have any delalloc that is - * flushing, so return the unflushed range found in the io tree (if any). - */ - if (outstanding_extents == 0) - return (delalloc_len > 0); - - /* - * Now also check if there's any extent map in the range that does not - * map to a hole or prealloc extent. We do this because: + * Now also check if there's any ordered extent in the range. 
+ * We do this because: * * 1) When delalloc is flushed, the file range is locked, we clear the - * EXTENT_DELALLOC bit from the io tree and create an extent map for - * an allocated extent. So we might just have been called after - * delalloc is flushed and before the ordered extent completes and - * inserts the new file extent item in the subvolume's btree; + * EXTENT_DELALLOC bit from the io tree and create an extent map and + * an ordered extent for the write. So we might just have been called + * after delalloc is flushed and before the ordered extent completes + * and inserts the new file extent item in the subvolume's btree; * - * 2) We may have an extent map created by flushing delalloc for a + * 2) We may have an ordered extent created by flushing delalloc for a * subrange that starts before the subrange we found marked with * EXTENT_DELALLOC in the io tree. + * + * We could also use the extent map tree to find such delalloc that is + * being flushed, but using the ordered extents tree is more efficient + * because it's usually much smaller as ordered extents are removed from + * the tree once they complete. With the extent maps, we may have them + * in the extent map tree for a very long time, and they were either + * created by previous writes or loaded by read operations. */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - read_unlock(&em_tree->lock); + oe = btrfs_lookup_first_ordered_range(inode, start, len); + if (!oe) return (delalloc_len > 0); - } - - /* extent_map_end() returns a non-inclusive end offset. */ - em_end = extent_map_end(em); - - /* - * If we have a hole/prealloc extent map, check the next one if this one - * ends before our range's end. - */ - if ((em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { - struct extent_map *next_em; - - next_em = btrfs_next_extent_map(em_tree, em); - free_extent_map(em); - - /* - * There's no next extent map or the next one starts beyond our - * range, return the range found in the io tree (if any). - */ - if (!next_em || next_em->start > end) { - read_unlock(&em_tree->lock); - free_extent_map(next_em); - return (delalloc_len > 0); - } - em_end = extent_map_end(next_em); - em = next_em; - } - - read_unlock(&em_tree->lock); + /* The ordered extent may span beyond our search range. */ + oe_start = max(oe->file_offset, start); + oe_end = min(oe->file_offset + oe->num_bytes - 1, end); - /* - * We have a hole or prealloc extent that ends at or beyond our range's - * end, return the range found in the io tree (if any). - */ - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - free_extent_map(em); - return (delalloc_len > 0); - } + btrfs_put_ordered_extent(oe); - /* - * We don't have any range as EXTENT_DELALLOC in the io tree, so the - * extent map is the only subrange representing delalloc. - */ + /* Don't have unflushed delalloc, return the ordered extent range. */ if (delalloc_len == 0) { - *delalloc_start_ret = em->start; - *delalloc_end_ret = min(end, em_end - 1); - free_extent_map(em); + *delalloc_start_ret = oe_start; + *delalloc_end_ret = oe_end; return true; } /* - * The extent map represents a delalloc range that starts before the - * delalloc range we found in the io tree. + * We have both unflushed delalloc (io_tree) and an ordered extent. + * If the ranges are adjacent, return a combined range, otherwise + * return the leftmost range.
*/ - if (em->start < *delalloc_start_ret) { - *delalloc_start_ret = em->start; - /* - * If the ranges are adjacent, return a combined range. - * Otherwise return the extent map's range. - */ - if (em_end < *delalloc_start_ret) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); - return true; + if (oe_start < *delalloc_start_ret) { + if (oe_end < *delalloc_start_ret) + *delalloc_end_ret = oe_end; + *delalloc_start_ret = oe_start; + } else if (*delalloc_end_ret + 1 == oe_start) { + *delalloc_end_ret = oe_end; } - /* - * The extent map starts after the delalloc range we found in the io - * tree. If it's adjacent, return a combined range, otherwise return - * the range found in the io tree. - */ - if (*delalloc_end_ret + 1 == em->start) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); return true; } -- cgit From cfd7a17d9b4588dd7a29e1a131257bee3e72b766 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:31 +0000 Subject: btrfs: remove no longer used btrfs_next_extent_map() There are no more users of btrfs_next_extent_map(), the previous patch in the series ("btrfs: search for delalloc more efficiently during lseek/fiemap") removed the last usage of the function, so delete it. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 29 ----------------------------- fs/btrfs/extent_map.h | 2 -- 2 files changed, 31 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a4362f5cc52..be94030e1dfb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -529,35 +529,6 @@ static struct extent_map *next_extent_map(const struct extent_map *em) return container_of(next, struct extent_map, rb_node); } -/* - * Get the extent map that immediately follows another one. - * - * @tree: The extent map tree that the extent map belong to. - * Holding read or write access on the tree's lock is required. - * @em: An extent map from the given tree. The caller must ensure that - * between getting @em and between calling this function, the - * extent map @em is not removed from the tree - for example, by - * holding the tree's lock for the duration of those 2 operations. - * - * Returns the extent map that immediately follows @em, or NULL if @em is the - * last extent map in the tree. 
- */ -struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, - const struct extent_map *em) -{ - struct extent_map *next; - - /* The lock must be acquired either in read mode or write mode. */ - lockdep_assert_held(&tree->lock); - ASSERT(extent_map_in_tree(em)); - - next = next_extent_map(em); - if (next) - refcount_inc(&next->refs); - - return next; -} - static struct extent_map *prev_extent_map(struct extent_map *em) { struct rb_node *prev; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 68d3f2c9ea1d..ad311864272a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -87,8 +87,6 @@ static inline u64 extent_map_block_end(struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, - const struct extent_map *em); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); -- cgit From 8c6e53a79d16b3651ad3abeb415e1c637da75082 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:32 +0000 Subject: btrfs: allow passing a cached state record to count_range_bits() An inode's io_tree can be quite large and there are cases where due to delalloc it can have thousands of extent state records, which makes the red black tree have a depth of 10 or more, making the operation of count_range_bits() slow if we repeatedly call it for a range that starts where, or after, the previous one we called it for ended. Such use cases are when searching for delalloc in a file range that corresponds to a hole or a prealloc extent, which is done during lseek SEEK_HOLE/DATA and fiemap. So introduce a cached state parameter to count_range_bits() which we use to store the last extent state record we visited, and then allow the caller to pass it again on its next call to count_range_bits(). The next patches in the series will make fiemap and lseek use the new parameter. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option).
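A condensed caller-side sketch of the new parameter (hypothetical usage, mirroring what the fiemap/lseek patches later in the series do; start/end/next_start/next_end are made-up offsets):

    struct extent_state *cached = NULL;
    u64 found_start = start;
    u64 bytes;

    /* First call: on return, @cached points to the last record visited. */
    bytes = count_range_bits(&inode->io_tree, &found_start, end,
                             end + 1 - start, EXTENT_DELALLOC, 1, &cached);

    /* A follow-up call starting at or after the previous one can resume
     * from @cached instead of descending the rbtree from its root. */
    found_start = next_start;
    bytes = count_range_bits(&inode->io_tree, &found_start, next_end,
                             next_end + 1 - next_start, EXTENT_DELALLOC, 1,
                             &cached);

    /* Drop the reference taken on the cached record. */
    free_extent_state(cached);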
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 2 +- 4 files changed, 49 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 285b0ff6e953..6b0d78df7eee 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1521,9 +1521,11 @@ out: */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) + u32 bits, int contig, + struct extent_state **cached_state) { - struct extent_state *state; + struct extent_state *state = NULL; + struct extent_state *cached; u64 cur_start = *start; u64 total_bytes = 0; u64 last = 0; @@ -1534,11 +1536,41 @@ u64 count_range_bits(struct extent_io_tree *tree, spin_lock(&tree->lock); + if (!cached_state || !*cached_state) + goto search; + + cached = *cached_state; + + if (!extent_state_in_tree(cached)) + goto search; + + if (cached->start <= cur_start && cur_start <= cached->end) { + state = cached; + } else if (cached->start > cur_start) { + struct extent_state *prev; + + /* + * The cached state starts after our search range's start. Check + * if the previous state record starts at or before the range we + * are looking for, and if so, use it - this is a common case + * when there are holes between records in the tree. If there is + * no previous state record, we can start from our cached state. + */ + prev = prev_state(cached); + if (!prev) + state = cached; + else if (prev->start <= cur_start && cur_start <= prev->end) + state = prev; + } + /* * This search will find all the extents that end after our range * starts. 
*/ - state = tree_search(tree, cur_start); +search: + if (!state) + state = tree_search(tree, cur_start); + while (state) { if (state->start > search_end) break; @@ -1559,7 +1591,16 @@ u64 count_range_bits(struct extent_io_tree *tree, } state = next_state(state); } + + if (cached_state) { + free_extent_state(*cached_state); + *cached_state = state; + if (state) + refcount_inc(&state->refs); + } + spin_unlock(&tree->lock); + return total_bytes; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 18ab82f62611..e3eeec380844 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -119,7 +119,8 @@ void __cold extent_state_free_cachep(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig); + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6bc2397e324c..dc8399610ca3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3235,7 +3235,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end *delalloc_start_ret = start; delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); + len, EXTENT_DELALLOC, 1, + NULL); } else { spin_unlock(&inode->lock); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aec1b232a71c..83898bca39d5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1769,7 +1769,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0); + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; -- cgit From 1ee51a06255d425c1b7effff095f0bf9c7240078 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:33 +0000 Subject: btrfs: update stale comment for count_range_bits() The comment for count_range_bits() mentions that the search is fast if we are asking for a range with the EXTENT_DIRTY bit set. However that is no longer true since we don't use that bit and the optimization for that was removed in: commit 71528e9e16c7 ("btrfs: get rid of extent_io_tree::dirty_bytes") So remove that part of the comment mentioning the no longer existing optimized case, and, while at it, add proper documentation describing the purpose, arguments and return value of the function. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6b0d78df7eee..21fa15123af8 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1515,9 +1515,29 @@ out: } /* - * Count the number of bytes in the tree that have a given bit(s) set. This - * can be fairly slow, except for EXTENT_DIRTY which is cached. The total - * number found is returned. + * Count the number of bytes in the tree that have a given bit(s) set for a + * given range. + * + * @tree: The io tree to search. + * @start: The start offset of the range. This value is updated to the + * offset of the first byte found with the given bit(s), so it + * can end up being bigger than the initial value. 
+ * @search_end: The end offset (inclusive value) of the search range. + * @max_bytes: The maximum byte count we are interested in. The search stops + * once it reaches this count. + * @bits: The bits the range must have in order to be accounted for. + * If multiple bits are set, then only subranges that have all + * the bits set are accounted for. + * @contig: Indicate if we should ignore holes in the range or not. If + * this is true, then stop once we find a hole. + * @cached_state: A cached state to be used across multiple calls to this + * function in order to speed up searches. Use NULL if this is + * called only once or if each call does not start where the + * previous one ended. + * + * Returns the total number of bytes found within the given range that have + * all given bits set. If the returned number of bytes is greater than zero + * then @start is updated with the offset of the first byte with the bits set. */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, -- cgit From b3e744fe6d289bec77b138b6201201e4148dcb0e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:34 +0000 Subject: btrfs: use cached state when looking for delalloc ranges with fiemap During fiemap, whenever we find a hole or prealloc extent, we will look for delalloc in that range, and one of the things we do for that is to find out ranges in the inode's io_tree marked with EXTENT_DELALLOC, using calls to count_range_bits(). Since we process file extents from left to right, if we have a file with several holes or prealloc extents, we benefit from keeping a cached extent state record for calls to count_range_bits(). Most of the time the last extent state record we visited in one call to count_range_bits() matches the first extent state record we will use in the next call to count_range_bits(), so there's a benefit here. So use an extent state record to cache results from count_range_bits() calls during fiemap. This change is part of a patchset whose goal is to improve performance for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option).
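To make the @cached_state contract concrete, here is a minimal caller-side sketch (illustrative only, not code from the patch; the btrfs_inode pointer and the start/end offsets are assumed to be in scope) of a left-to-right walk that reuses the cached state across calls:

	struct extent_state *cached = NULL;
	u64 cur = start;

	while (cur <= end) {
		u64 found = cur;
		u64 bytes;

		/* Each call resumes near where the previous one stopped. */
		bytes = count_range_bits(&inode->io_tree, &found, end,
					 end - cur + 1, EXTENT_DELALLOC, 1,
					 &cached);
		if (bytes == 0)
			break;
		/* Process the subrange [found, found + bytes - 1] here. */
		cur = found + bytes;
	}
	/* Drop the reference held by the cached state (NULL is safe). */
	free_extent_state(cached);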
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 ++++++++++- fs/btrfs/file.c | 10 +++++++--- fs/btrfs/file.h | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33cc3a9a2b68..704ae7c08867 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3698,6 +3698,7 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, + struct extent_state **delalloc_cached_state, struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, @@ -3722,6 +3723,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3892,6 +3894,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, { const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; @@ -3966,6 +3969,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 range_end = min(key.offset, lockend) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end); if (ret < 0) { @@ -4006,6 +4010,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, disk_bytenr, extent_offset, extent_gen, key.offset, @@ -4013,6 +4018,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } else if (disk_bytenr == 0) { /* We have an explicit hole. 
*/ ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, key.offset, extent_end - 1); } else { @@ -4070,7 +4076,8 @@ check_eof_delalloc: path = NULL; if (!stopped && prev_extent_end < lockend) { - ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, lockend - 1); if (ret < 0) goto out_unlock; @@ -4088,6 +4095,7 @@ check_eof_delalloc: delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, + &delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -4102,6 +4110,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); out: + free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); return ret; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dc8399610ca3..da390f891220 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3214,6 +3214,7 @@ out: * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { @@ -3236,7 +3237,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, len, EXTENT_DELALLOC, 1, - NULL); + cached_state); } else { spin_unlock(&inode->lock); } @@ -3324,6 +3325,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * sector size aligned. * @end: The end offset (inclusive value) of the search range. * It does not need to be sector size aligned. + * @cached_state: Extent state record used for speeding up delalloc + * searches in the inode's io_tree. Can be NULL. * @delalloc_start_ret: Output argument, set to the start offset of the * subrange found with delalloc (may not be sector size * aligned). @@ -3335,6 +3338,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * end offsets of the subrange. 
*/ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); @@ -3348,7 +3352,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, - &search_io_tree, + cached_state, &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3400,7 +3404,7 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, u64 delalloc_end; bool delalloc; - delalloc = btrfs_find_delalloc_in_range(inode, start, end, + delalloc = btrfs_find_delalloc_in_range(inode, start, end, NULL, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index f3d794e33d46..82b34fbb295f 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -27,6 +27,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); #endif -- cgit From 3c32c7212f1639471ec0197ff1179b8ef2e0f3d3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:35 +0000 Subject: btrfs: use cached state when looking for delalloc ranges with lseek During lseek (SEEK_HOLE/DATA), whenever we find a hole or prealloc extent, we will look for delalloc in that range, and one of the things we do for that is to find out ranges in the inode's io_tree marked with EXTENT_DELALLOC, using calls to count_range_bits(). Typically there's a single, or few, searches in the io_tree for delalloc per lseek call. However it's common for applications to keep calling lseek with SEEK_HOLE and SEEK_DATA to find where extents and holes are in a file, read the extents and skip holes in order to avoid unnecessary IO and save disk space by preserving holes. One popular user is the cp utility from coreutils. Starting with coreutils 9.0, cp uses SEEK_HOLE and SEEK_DATA to iterate over the extents of a file. Before 9.0, it used fiemap to figure out where holes and extents are in the source file. Another popular user is the tar utility when used with the --sparse / -S option to detect and preserve holes. Given that the pattern is to keep calling lseek with a start offset that matches the returned offset from the previous lseek call, we can benefit from caching the last extent state visited in count_range_bits() and use it for the next count_range_bits() from the next lseek call. Example, the following strace excerpt from running tar: $ strace tar cJSvf foo.tar.xz qemu_disk_file.raw (...) lseek(5, 125019574272, SEEK_HOLE) = 125024989184 lseek(5, 125024989184, SEEK_DATA) = 125024993280 lseek(5, 125024993280, SEEK_HOLE) = 125025239040 lseek(5, 125025239040, SEEK_DATA) = 125025255424 lseek(5, 125025255424, SEEK_HOLE) = 125025353728 lseek(5, 125025353728, SEEK_DATA) = 125025357824 lseek(5, 125025357824, SEEK_HOLE) = 125026766848 lseek(5, 125026766848, SEEK_DATA) = 125026770944 lseek(5, 125026770944, SEEK_HOLE) = 125027053568 (...) Shows that pattern, which is the same as with cp from coreutils 9.0+. 
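For reference, the userspace side of that access pattern is essentially the following loop (a hedged sketch, not code taken from coreutils or tar; fd is an assumed open file descriptor):

#define _GNU_SOURCE
#include <unistd.h>

static void walk_extents(int fd)
{
	off_t data = 0, hole;

	/* Each lseek() starts exactly where the previous one returned,
	 * which is what makes the kernel-side cached state effective. */
	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		/* Copy or report the data in [data, hole) here. */
		data = hole;
	}
	/* SEEK_DATA fails with ENXIO once data reaches end of file. */
}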
So start using a cached state for the delalloc searches in lseek, and store it in struct file's private data so that it can be reused across lseek calls. This change is part of a patchset that is comprised of the following patches: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek The following test was run before and after applying the whole patchset: $ cat test-cp.sh #!/bin/bash DEV=/dev/sdh MNT=/mnt/sdh # coreutils 8.32, cp uses fiemap to detect holes and extents #CP_PROG=/usr/bin/cp # coreutils 9.1, cp uses SEEK_HOLE/DATA to detect holes and extents CP_PROG=/home/fdmanana/git/hub/coreutils/src/cp umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1024 * 1024 * 1024)) echo "Creating file with a size of $((FILE_SIZE / 1024 / 1024))M" # Create a very sparse file, where each extent has a length of 4K and # is preceded by a 4K hole and followed by another 4K hole. start=$(date +%s%N) echo -n > $MNT/foobar for ((off = 0; off < $FILE_SIZE; off += 8192)); do xfs_io -c "pwrite -S 0xab $off 4K" $MNT/foobar > /dev/null echo -ne "\r$off / $FILE_SIZE ..." done end=$(date +%s%N) echo -e "\nFile created ($(( (end - start) / 1000000 )) milliseconds)" start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds with data/metadata cached and delalloc" # Flush all delalloc. sync start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds with data/metadata cached and no delalloc" # Unmount and mount again to test the case without any metadata # loaded in memory. 
umount $MNT mount $DEV $MNT start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds without data/metadata cached and no delalloc" umount $MNT The results, running on a box with a non-debug kernel (Debian's default kernel config), were the following: 128M file, before patchset: cp took 16574 milliseconds with data/metadata cached and delalloc cp took 122 milliseconds with data/metadata cached and no delalloc cp took 20144 milliseconds without data/metadata cached and no delalloc 128M file, after patchset: cp took 6277 milliseconds with data/metadata cached and delalloc cp took 109 milliseconds with data/metadata cached and no delalloc cp took 210 milliseconds without data/metadata cached and no delalloc 512M file, before patchset: cp took 14369 milliseconds with data/metadata cached and delalloc cp took 429 milliseconds with data/metadata cached and no delalloc cp took 88034 milliseconds without data/metadata cached and no delalloc 512M file, after patchset: cp took 12106 milliseconds with data/metadata cached and delalloc cp took 427 milliseconds with data/metadata cached and no delalloc cp took 824 milliseconds without data/metadata cached and no delalloc 1G file, before patchset: cp took 10074 milliseconds with data/metadata cached and delalloc cp took 886 milliseconds with data/metadata cached and no delalloc cp took 181261 milliseconds without data/metadata cached and no delalloc 1G file, after patchset: cp took 3320 milliseconds with data/metadata cached and delalloc cp took 880 milliseconds with data/metadata cached and no delalloc cp took 1801 milliseconds without data/metadata cached and no delalloc Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5649f8907984..99defab7e1f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -426,6 +426,7 @@ struct btrfs_drop_extents_args { struct btrfs_file_private { void *filldir_buf; + struct extent_state *llseek_cached_state; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da390f891220..448b143a5cb2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1696,10 +1696,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; - if (private && private->filldir_buf) + if (private) { kfree(private->filldir_buf); - kfree(private); - filp->private_data = NULL; + free_extent_state(private->llseek_cached_state); + kfree(private); + filp->private_data = NULL; + } /* * Set by setattr when we are about to truncate a file from a non-zero @@ -3398,13 +3400,14 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, * is found, it updates @start_ret with the start of the subrange. 
*/ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, + struct extent_state **cached_state, u64 start, u64 end, u64 *start_ret) { u64 delalloc_start; u64 delalloc_end; bool delalloc; - delalloc = btrfs_find_delalloc_in_range(inode, start, end, NULL, + delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; @@ -3447,11 +3450,13 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, return false; } -static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, - int whence) +static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) { + struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); + struct btrfs_file_private *private = file->private_data; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; + struct extent_state **delalloc_cached_state; const loff_t i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; @@ -3476,6 +3481,22 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, inode_get_bytes(&inode->vfs_inode) == i_size) return i_size; + if (!private) { + private = kzalloc(sizeof(*private), GFP_KERNEL); + /* + * No worries if memory allocation failed. + * The private structure is used only for speeding up multiple + * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, + * so everything will still be correct. + */ + file->private_data = private; + } + + if (private) + delalloc_cached_state = &private->llseek_cached_state; + else + delalloc_cached_state = NULL; + /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. @@ -3553,6 +3574,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, key.offset - 1, &found_start); @@ -3587,6 +3609,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, extent_end - 1, &found_start); @@ -3628,7 +3651,8 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, /* We have an implicit hole from the last extent found up to i_size. 
*/ if (!found && start < i_size) { - found = find_desired_extent_in_hole(inode, whence, start, + found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, start, i_size - 1, &start); if (!found) start = i_size; @@ -3657,7 +3681,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); - offset = find_desired_extent(BTRFS_I(inode), offset, whence); + offset = find_desired_extent(file, offset, whence); btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } -- cgit From cb3e217bdb39e390f8e64af519acb02af336b53d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 13 Nov 2022 09:32:07 +0800 Subject: btrfs: use btrfs_dev_name() helper to handle missing devices better [BUG] If dev-replace failed to re-construct its data/metadata, the kernel message would be incorrect for the missing device: BTRFS info (device dm-1): dev_replace from <missing disk> (devid 2) to /dev/mapper/test-scratch2 started BTRFS error (device dm-1): failed to rebuild valid logical 38862848 for dev (efault) Note the above "dev (efault)" of the second line. While the first line is properly reporting "<missing disk>". [CAUSE] Although dev-replace is using btrfs_dev_name(), the heavy lifting work is still done by scrub (scrub is reused by both dev-replace and regular scrub). Unfortunately scrub code never uses the btrfs_dev_name() helper, as it's only declared locally inside dev-replace.c. [FIX] Fix the output by: - Move the btrfs_dev_name() helper to volumes.h - Use btrfs_dev_name() to replace open-coded rcu_str_deref() calls Only zoned code is not touched, as I'm not familiar with degraded zoned code. - Constify return value and parameter Now the output looks pretty sane: BTRFS info (device dm-1): dev_replace from <missing disk> (devid 2) to /dev/mapper/test-scratch2 started BTRFS error (device dm-1): failed to rebuild valid logical 38862848 for dev <missing disk> Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/dev-replace.c | 15 +++------------ fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 3 +-- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/scrub.c | 20 +++++++++----------- fs/btrfs/super.c | 2 +- fs/btrfs/volumes.c | 16 ++++++++-------- fs/btrfs/volumes.h | 9 +++++++++ 10 files changed, 36 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7ff0703ef3e4..82e49d985019 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -757,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror( btrfs_info_in_rcu(fs_info, "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", superblock_bdev, - rcu_str_deref(device->name), dev_bytenr, + btrfs_dev_name(device), dev_bytenr, dev_state->bdev, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9c4a8649a0f4..78696d331639 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -18,7 +18,6 @@ #include "volumes.h" #include "async-thread.h" #include "check-integrity.h" -#include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" @@ -451,14 +450,6 @@ out: return ret; } -static char* btrfs_dev_name(struct btrfs_device *device) -{ - if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - return "<missing disk>"; - else - return rcu_str_deref(device->name); -} - static int
mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_device *src_dev) { @@ -674,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); /* * from now on, the writes to the srcdev are all duplicated to @@ -933,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_name(tgt_device), scrub_ret); error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); @@ -951,7 +942,7 @@ error: "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 559e7f3d727e..91a088210e5a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3944,7 +3944,7 @@ static void btrfs_end_super_write(struct bio *bio) if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, "lost page write due to IO error on %s (%d)", - rcu_str_deref(device->name), + btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); ClearPageUptodate(page); SetPageError(page); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 10cc757a602b..17f599027c3d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6048,7 +6048,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, - rcu_str_deref(device->name), + btrfs_dev_name(device), device->total_bytes); mutex_unlock(&fs_info->chunk_mutex); ret = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 704ae7c08867..65ba5c3658cf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -611,8 +611,7 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, - rcu_str_deref(dev->name), sector); + ino, start, btrfs_dev_name(dev), sector); ret = 0; out_bio_uninit: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7a9e697d9931..bed74a3ff574 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1228,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret == 0 && new_size != old_size) btrfs_info_in_rcu(fs_info, "resize device %s (devid %llu) from %llu to %llu", - rcu_str_deref(device->name), device->devid, + btrfs_dev_name(device), device->devid, old_size, new_size); out_finish: btrfs_exclop_finish(fs_info); @@ -2860,7 +2860,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { - strncpy(di_args->path, rcu_str_deref(dev->name), + strncpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path) - 1); di_args->path[sizeof(di_args->path) - 1] = 0; } else { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 288a97992aa4..0ce5b773867f 100644 --- a/fs/btrfs/scrub.c +++ 
b/fs/btrfs/scrub.c @@ -17,7 +17,6 @@ #include "extent_io.h" #include "dev-replace.h" #include "check-integrity.h" -#include "rcu-string.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" @@ -877,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, fs_info->sectorsize, nlink, @@ -891,7 +890,7 @@ err: btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, ret); @@ -922,8 +921,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) /* Super block error, no need to search extent tree. */ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", - errstr, rcu_str_deref(dev->name), - sblock->physical); + errstr, btrfs_dev_name(dev), sblock->physical); return; } path = btrfs_alloc_path(); @@ -954,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, - rcu_str_deref(dev->name), + btrfs_dev_name(dev), swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, @@ -1377,7 +1375,7 @@ corrected_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } } else { did_not_correct_error: @@ -1386,7 +1384,7 @@ did_not_correct_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } out: @@ -2332,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work) spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "IO error rebuilding logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "failed to rebuild valid logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else { scrub_write_block_to_dev_replace(sblock); } @@ -4303,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub on devid %llu: filesystem on %s is not writable", - devid, rcu_str_deref(dev->name)); + devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d54bfec8e506..ea83dd9f735a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2336,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * the end of RCU grace period. 
*/ rcu_read_lock(); - seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); + seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); rcu_read_unlock(); return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49d6e79d4343..2d0950a17f48 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -941,7 +941,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", - devid, rcu_str_deref(device->name), + devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } @@ -2101,7 +2101,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", - rcu_str_deref(device->name), device->devid); + btrfs_dev_name(device), device->devid); return -ETXTBSY; } @@ -6827,7 +6827,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); btrfsic_check_bio(bio); @@ -7908,7 +7908,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "error %d while searching for dev_stats item for device %s", - ret, rcu_str_deref(device->name)); + ret, btrfs_dev_name(device)); goto out; } @@ -7919,7 +7919,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret != 0) { btrfs_warn_in_rcu(fs_info, "delete too small dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } ret = 1; @@ -7933,7 +7933,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "insert dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } } @@ -7998,7 +7998,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) return; btrfs_err_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -8018,7 +8018,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) btrfs_info_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index efa6a3d48cd8..2c90e50c460a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -12,6 +12,7 @@ #include "async-thread.h" #include "messages.h" #include "disk-io.h" +#include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -770,6 +771,14 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } +static inline const char *btrfs_dev_name(const struct btrfs_device *device) +{ + if (!device || 
test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + return "<missing disk>"; + else + return rcu_str_deref(device->name); +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -- cgit From 9f0eac070d23405f18e7a84820bc3d59b1415bec Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 26 Oct 2022 09:36:11 +0800 Subject: btrfs: allocate btrfs_io_context without GFP_NOFAIL The __GFP_NOFAIL flag could loop indefinitely when allocating memory in alloc_btrfs_io_context(). The callers starting from __btrfs_map_block already handle errors so it's safe to drop the flag. Signed-off-by: Li zeming Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2d0950a17f48..e51fd5f1042a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5899,7 +5899,10 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ * and the stripes. */ sizeof(u64) * (total_stripes), - GFP_NOFS|__GFP_NOFAIL); + GFP_NOFS); + + if (!bioc) + return NULL; refcount_set(&bioc->refs, 1); -- cgit From cb649e81dad429ffbd97a689ac0011599952a668 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:31 +0800 Subject: btrfs: refactor checksum calculations in btrfs_lookup_csums_range() The refactoring involves the following parts: - Introduce bytes_to_csum_size() and csum_size_to_bytes() helpers As we have quite some open-coded calculations, some of them are even split into two assignments just to fit the 80 chars limit. - Remove the @csum_size parameter from max_ordered_sum_bytes() Csum size can be fetched from @fs_info. And we will use the csum_size_to_bytes() helper anyway. - Add a comment explaining how we handle the first search result - Use the newly introduced helpers to clean up btrfs_lookup_csums_range() - Move variable declarations to the minimal scope - Never mix number of sectors with bytes There are several locations doing things like: size = min_t(size_t, csum_end - start, max_ordered_sum_bytes(fs_info)); ... size >>= fs_info->sectorsize_bits Or offset = (start - key.offset) >> fs_info->sectorsize_bits; offset *= csum_size; Make sure these variables can only represent BYTES inside the function, by using the above bytes_to_csum_size() helpers.
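As a worked example of the two new helpers (their definitions appear in the diff that follows), assuming the common case of 4 KiB sectors (sectorsize_bits == 12) and CRC32C checksums (csum_size == 4):

	/* 64 KiB of data -> 16 sectors -> 64 bytes of csum data. */
	bytes_to_csum_size(fs_info, SZ_64K);	/* (65536 >> 12) * 4 == 64 */

	/* 64 bytes of csum data -> 16 csums -> 64 KiB of data covered. */
	csum_size_to_bytes(fs_info, 64);	/* (64 / 4) << 12 == 65536 */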
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 68 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 456f71b42a9c..654ad2494bce 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -126,12 +126,26 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, start + len - 1, EXTENT_DIRTY, NULL); } -static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, - u16 csum_size) +static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) { - u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize)); - return ncsums * fs_info->sectorsize; + return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size; +} + +static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size) +{ + ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size)); + + return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits; +} + +static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) +{ + u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum), + fs_info->csum_size); + + return csum_size_to_bytes(fs_info, max_csum_size); } /* @@ -140,9 +154,7 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, */ static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { - int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - - return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; + return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, @@ -526,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; LIST_HEAD(tmplist); - unsigned long offset; int ret; - size_t size; - u64 csum_end; - const u32 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -556,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
+ */ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> fs_info->sectorsize_bits; - if (offset * csum_size < + if (bytes_to_csum_size(fs_info, start - key.offset) < btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } while (start <= end) { + u64 csum_end; + leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -585,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > start) start = key.offset; - size = btrfs_item_size(leaf, path->slots[0]); - csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); if (csum_end <= start) { path->slots[0]++; continue; @@ -596,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); while (start < csum_end) { + unsigned long offset; + size_t size; + size = min_t(size_t, csum_end - start, - max_ordered_sum_bytes(fs_info, csum_size)); + max_ordered_sum_bytes(fs_info)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -608,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->bytenr = start; sums->len = (int)size; - offset = (start - key.offset) >> fs_info->sectorsize_bits; - offset *= csum_size; - size >>= fs_info->sectorsize_bits; + offset = bytes_to_csum_size(fs_info, start - key.offset); read_extent_buffer(path->nodes[0], sums->sums, ((unsigned long)item) + offset, - csum_size * size); + bytes_to_csum_size(fs_info, size)); - start += fs_info->sectorsize * size; + start += size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; -- cgit From 97e3823933108cfc648bb08d5ad36251b6588164 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:32 +0800 Subject: btrfs: introduce a bitmap based csum range search function Although we have an existing function, btrfs_lookup_csums_range(), to find all data checksums for a range, it's based on a btrfs_ordered_sum list. For the incoming RAID56 data checksum verification at RMW time, we don't want to waste time by allocating temporary memory. So this patch will introduce a new helper, btrfs_lookup_csums_bitmap(). It will use bitmap based result, which will be a perfect fit for later RAID56 usage. 
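To illustrate the new helper's contract, a caller-side sketch (assumptions: csum_root, fs_info, start and end are in scope and the range is sector aligned; the real RAID56 caller arrives in a later patch of this series):

	const u32 nsectors = (end + 1 - start) >> fs_info->sectorsize_bits;
	u8 *csum_buf = kzalloc(nsectors * fs_info->csum_size, GFP_NOFS);
	unsigned long *csum_bitmap = bitmap_zalloc(nsectors, GFP_NOFS);

	if (csum_buf && csum_bitmap) {
		int ret;

		ret = btrfs_lookup_csums_bitmap(csum_root, start, end,
						csum_buf, csum_bitmap);
		/*
		 * On success, bit i set in csum_bitmap means sector i of
		 * the range has its csum stored at
		 * csum_buf + i * fs_info->csum_size.
		 */
	}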
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/file-item.h | 8 ++-- fs/btrfs/inode.c | 5 +- fs/btrfs/relocation.c | 4 +- fs/btrfs/scrub.c | 8 ++-- fs/btrfs/tree-log.c | 15 +++--- 6 files changed, 144 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 654ad2494bce..352bbb33b76f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -527,9 +527,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return ret; } -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait) +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -661,6 +661,127 @@ fail: return ret; } +/* + * Do the same work as btrfs_lookup_csums_list(), the difference is in how + * we return the result. + * + * This version will set the corresponding bits in @csum_bitmap to represent + * that there is a csum found. + * Each bit represents a sector. Thus caller should ensure @csum_buf passed + * in is large enough to contain all csums. + */ +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_csum_item *item; + const u64 orig_start = start; + int ret; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
+ */ + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + if (bytes_to_csum_size(fs_info, start - key.offset) < + btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + u64 csum_end; + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + unsigned long offset; + size_t size; + u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info, + start - orig_start); + + size = min_t(size_t, csum_end - start, end + 1 - start); + + offset = bytes_to_csum_size(fs_info, start - key.offset); + + read_extent_buffer(path->nodes[0], csum_dest, + ((unsigned long)item) + offset, + bytes_to_csum_size(fs_info, size)); + + bitmap_set(csum_bitmap, + (start - orig_start) >> fs_info->sectorsize_bits, + size >> fs_info->sectorsize_bits); + + start += size; + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + /* * Calculate checksums of the data contained inside a bio. * diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index ba12711cf933..95b371208b30 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -18,9 +18,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83898bca39d5..4248e6cabbdc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1709,9 +1709,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0, - nowait); + ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, + &list, 0, nowait); if (ret == 0 && list_empty(&list)) return 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 56c8afa6f6d2..aa80e51bc8ca 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4357,8 +4357,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + inode->index_cnt; csum_root = btrfs_csum_root(fs_info, disk_bytenr); - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + 
disk_bytenr + len - 1, &list, 0, false); if (ret) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0ce5b773867f..52b346795f66 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3238,9 +3238,9 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, extent_dev = bioc->stripes[0].dev; btrfs_put_bioc(bioc); - ret = btrfs_lookup_csums_range(csum_root, extent_start, - extent_start + extent_size - 1, - &sctx->csum_list, 1, false); + ret = btrfs_lookup_csums_list(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1, false); if (ret) { scrub_parity_mark_sectors_error(sparity, extent_start, extent_size); @@ -3464,7 +3464,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_logical; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, cur_logical, + ret = btrfs_lookup_csums_list(csum_root, cur_logical, cur_logical + scrub_len - 1, &sctx->csum_list, 1, false); if (ret) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4d9f6803bfbe..8f8d7e7dc3a8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -826,7 +826,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_file_extent_num_bytes(eb, item); } - ret = btrfs_lookup_csums_range(root->log_root, + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, &ordered_sums, 0, false); if (ret) @@ -4443,9 +4443,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); disk_bytenr += extent_offset; - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0, false); if (ret) goto out; @@ -4638,10 +4638,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, /* block start is already adjusted for the file extent offset. */ csum_root = btrfs_csum_root(trans->fs_info, em->block_start); - ret = btrfs_lookup_csums_range(csum_root, - em->block_start + csum_offset, - em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0, false); if (ret) return ret; -- cgit From c5a415627be4758ebdd274de7148706d6713c7ec Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:33 +0800 Subject: btrfs: raid56: prepare data checksums for later RMW verification This is for later data checksum verification at RMW time. This patch will try to allocate the needed memory for a locked rbio if the rbio is for data exclusively (we don't want to handle mixed bg yet). The memory will be released when the rbio is finished. 
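For a sense of scale, a back-of-the-envelope example (assumed geometry, not taken from the patch): a 3-disk RAID5 full stripe with a 64 KiB stripe length, 4 KiB sectors and CRC32C checksums gives nr_data = 2 and stripe_nsectors = 16, so the two allocations amount to:

	csum_buf:    2 * 16 * 4 bytes = 128 bytes
	csum_bitmap: 2 * 16 = 32 bits (a single unsigned long)

In other words, the per-rbio cost of carrying the checksums until the rbio finishes is negligible.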
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/raid56.h | 12 +++++++++ 2 files changed, 86 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 11be5d0a7eab..5ef4fbb49df2 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -20,6 +20,7 @@ #include "volumes.h" #include "raid56.h" #include "async-thread.h" +#include "file-item.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -835,6 +836,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; + /* * Clear the data bitmap, as the rbio may be cached for later usage. * do this before unlock_stripe() so there will be no new bio * for this bio @@ -2048,6 +2054,67 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, start_async_work(rbio, recover_rbio_work); } +static void fill_data_csums(struct btrfs_raid_bio *rbio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, + rbio->bioc->raid_map[0]); + const u64 start = rbio->bioc->raid_map[0]; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; + + /* The rbio should not have its csum buffer initialized. */ + ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); + + /* + * Skip the csum search if: + * + * - The rbio doesn't belong to data block groups + * Then we are doing IO for tree blocks, no need to search csums. + * + * - The rbio belongs to mixed block groups + * This is to avoid deadlock, as we're already holding the full + * stripe lock, if we trigger a metadata read, and it needs to do + * raid56 recovery, we will deadlock. + */ + if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || + rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) + return; + + rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * + fs_info->csum_size, GFP_NOFS); + rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, + GFP_NOFS); + if (!rbio->csum_buf || !rbio->csum_bitmap) { + ret = -ENOMEM; + goto error; + } + + ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) + goto no_csum; + return; + +error: + /* + * We failed to allocate memory or grab the csum, but it's not fatal, + * we can still continue. But better to warn users that RMW is no + * longer safe for this particular sub-stripe write. + */ + btrfs_warn_rl(fs_info, +"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", + rbio->bioc->raid_map[0], ret); +no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; +} + static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; @@ -2056,6 +2123,13 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) bio_list_init(&bio_list); + /* + * Fill the data csums we need for data verification. We need to fill + * the csum_bitmap/csum_buf first, as our endio function will try to + * verify the data sectors.
+ */ + fill_data_csums(rbio); + ret = rmw_assemble_read_bios(rbio, &bio_list); if (ret < 0) goto out; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index a2e653e93fd8..7c73a443939e 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -129,6 +129,18 @@ struct btrfs_raid_bio { * Thus making it much harder to iterate. */ unsigned long *error_bitmap; + + /* + * Checksum buffer if the rbio is for data. The buffer should cover + * all data sectors (excluding P/Q sectors). + */ + u8 *csum_buf; + + /* + * Each bit represents if the corresponding sector has data csum found. + * Should only cover data sectors (excluding P/Q sectors). + */ + unsigned long *csum_bitmap; }; /* -- cgit From 7a3150723061ba1ac2ba10b1392b1cd75234172a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:34 +0800 Subject: btrfs: raid56: do data csum verification during RMW cycle [BUG] For the following small script, btrfs will be unable to recover the content of file1: mkfs.btrfs -f -m raid1 -d raid5 -b 1G $dev1 $dev2 $dev3 mount $dev1 $mnt xfs_io -f -c "pwrite -S 0xff 0 64k" -c sync $mnt/file1 md5sum $mnt/file1 umount $mnt # Corrupt the above 64K data stripe. xfs_io -f -c "pwrite -S 0x00 323026944 64K" -c sync $dev3 mount $dev1 $mnt # Write a new 64K, which should be in the other data stripe # And this is a sub-stripe write, which will cause RMW xfs_io -f -c "pwrite 0 64k" -c sync $mnt/file2 md5sum $mnt/file1 umount $mnt The above md5sum would fail. [CAUSE] There is a long-existing problem for raid56 (not limited to btrfs raid56) that, if we already have some corrupted on-disk data, and then trigger a sub-stripe write (which needs an RMW cycle), it can cause further damage to the P/Q stripe. Disk 1: data 1 |0x000000000000| <- Corrupted Disk 2: data 2 |0x000000000000| Disk 3: parity |0xffffffffffff| In the above case, data 1 is already corrupted, the original data should be 64KiB of 0xff. At this stage, if we read data 1, and it has a data checksum, we can still recover via the regular RAID56 recovery path. But if now we decide to write some data into data 2, then we need to go RMW. Let's say we want to write 64KiB of '0x00' into data 2, then we read the on-disk data of data 1, calculate the new parity, resulting in the following layout: Disk 1: data 1 |0x000000000000| <- Corrupted Disk 2: data 2 |0x000000000000| <- New '0x00' writes Disk 3: parity |0x000000000000| <- New Parity. But the new parity is calculated using the *corrupted* data 1, so we can no longer recover the correct data of data 1. Thus the corruption is there forever. [FIX] To solve the above problem, this patch will do a full stripe data checksum verification at RMW time. This involves the following changes: - Always read the full stripe (including data/P/Q) when doing RMW Before we only read the missing data sectors, but since we may do a data csum verification and recovery, we need to read everything out. Please note that, if we have a cached rbio, we don't need to read anything, and can treat it the same as a full stripe write. As only a stripe with all its csums matching can be cached. - Verify the data csum during read. The goal is only the rbio stripe sectors, and only if the rbio already has csum_buf/csum_bitmap filled. And sectors which cannot pass csum verification will have their bit set in error_bitmap. - Always call recover_sectors() after we read out all the sectors Since error_bitmap will be updated during read, recover_sectors() can easily find out all the bad sectors and try to recover (if still under tolerance).
And since recover_sectors() is already migrated to use error_bitmap, it can skip vertical stripes which don't have any error. - Verify the repaired sectors against their csums in recover_vertical() - Rename rmw_read_and_wait() to rmw_read_wait_recover() Since we will always recover the sectors, the old name is no longer accurate. Furthermore, since recovery is already done in rmw_read_wait_recover(), we no longer need to call recover_sectors() inside rmw_rbio(). Obviously this will have a performance impact, as we are doing more work during the RMW cycle: - Fetch the data checksums - Do checksum verification for all data stripes - Do checksum verification again after repair But for a full stripe write or a cached rbio we won't have the overhead at all, thus for a fully optimized RAID56 workload (always full stripe write), there should be no extra overhead. To me, the extra overhead looks reasonable, as data consistency is way more important than performance. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 169 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 137 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5ef4fbb49df2..2d90a6b5eb00 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -21,6 +21,7 @@ #include "raid56.h" #include "async-thread.h" #include "file-item.h" +#include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -1433,14 +1434,56 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi bio_size >> rbio->bioc->fs_info->sectorsize_bits); } +/* Verify the data sectors at read time. */ +static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + int total_sector_nr = get_bio_sector_nr(rbio, bio); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + /* No data csum for the whole stripe, no need to verify. */ + if (!rbio->csum_bitmap || !rbio->csum_buf) + return; + + /* P/Q stripes, they have no data csum to verify against. */ + if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) + return; + + bio_for_each_segment_all(bvec, bio, iter_all) { + int bv_offset; + + for (bv_offset = bvec->bv_offset; + bv_offset < bvec->bv_offset + bvec->bv_len; + bv_offset += fs_info->sectorsize, total_sector_nr++) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + + total_sector_nr * fs_info->csum_size; + int ret; + + /* No csum for this sector, skip to the next sector.
 */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; + + ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, + bv_offset, csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + } + } +} + static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) + if (bio->bi_status) { rbio_update_error_bitmap(rbio, bio); - else + } else { set_bio_pages_uptodate(rbio, bio); + verify_bio_data_sectors(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -1469,37 +1512,25 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; struct bio *bio; int total_sector_nr; int ret = 0; ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing data sectors. */ - for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate for the later csum verification and + * recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - /* - * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a page - * in the bio list we don't need to read it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. If so, - * use it. - */ - if (sector->uptodate) - continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) @@ -1670,6 +1701,42 @@ fail: bio_endio(bio); } +static int verify_one_sector(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *csum_expected; + int ret; + + if (!rbio->csum_bitmap || !rbio->csum_buf) + return 0; + + /* No way to verify P/Q as they are not covered by data csum. */ + if (stripe_nr >= rbio->nr_data) + return 0; + /* + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. + */ + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + } + + ASSERT(sector->page); + + csum_expected = rbio->csum_buf + + (stripe_nr * rbio->stripe_nsectors + sector_nr) * + fs_info->csum_size; + ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, + csum_buf, csum_expected); + return ret; +} + /* * Recover a vertical stripe specified by @sector_nr. * @*pointers are the pre-allocated pointers by the caller, so we don't @@ -1685,6 +1752,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, int faila; int failb; int stripe_nr; + int ret = 0; /* * Now we just use bitmap to mark the horizontal stripes in @@ -1805,12 +1873,23 @@ pstripe: * uptodate. * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate.
+ * + * If possible, also check if the repaired sector matches its data + * checksum. */ if (faila >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + goto cleanup; + sector = rbio_stripe_sector(rbio, faila, sector_nr); sector->uptodate = 1; } if (failb >= 0) { + ret = verify_one_sector(rbio, failb, sector_nr); + if (ret < 0) + goto cleanup; + sector = rbio_stripe_sector(rbio, failb, sector_nr); sector->uptodate = 1; } @@ -1818,7 +1897,7 @@ pstripe: cleanup: for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) kunmap_local(unmap_array[stripe_nr]); - return 0; + return ret; } static int recover_sectors(struct btrfs_raid_bio *rbio) @@ -2115,7 +2194,7 @@ no_csum: rbio->csum_bitmap = NULL; } -static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) +static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; struct bio *bio; @@ -2136,6 +2215,12 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch); just let recover_sectors() handle them all. + */ + ret = recover_sectors(rbio); return ret; out: while ((bio = bio_list_pop(&bio_list))) @@ -2175,6 +2260,28 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, } } +/* + * Determine if we need to read any sector from the disk. + * Should only be utilized in the RMW path, to skip a cached rbio. + */ +static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + int i; + + for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + /* + * We have a sector which has no page or is not uptodate; + * thus this rbio cannot be a cached one, as a cached rbio must + * have all its data sectors present and uptodate. + */ + if (!sector->page || !sector->uptodate) + return true; + } + return false; +} + static int rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; @@ -2189,9 +2296,13 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) if (ret < 0) return ret; - /* Full stripe write, can write the full stripe right now. */ - if (rbio_is_full(rbio)) + /* + * Either this is a full stripe write, or we have every data sector already + * cached, so we can go to the write path immediately. + */ + if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) goto write; + /* * Now we're doing sub-stripe write, also need all data stripes to do * the full RMW. @@ -2202,16 +2313,10 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - ret = rmw_read_and_wait(rbio); + ret = rmw_read_wait_recover(rbio); if (ret < 0) return ret; - /* We have read errors, try recovery path. */ - if (!bitmap_empty(rbio->error_bitmap, rbio->nr_sectors)) { - ret = recover_rbio(rbio); - if (ret < 0) - return ret; - } write: /* * At this stage we're not allowed to add any new bios to the
-- cgit

From 27137fac4c0628fc8320bb7f1ce3bb9f84b76a9b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Nov 2022 10:44:04 +0100
Subject: btrfs: move struct btrfs_tree_parent_check out of disk-io.h

Move struct btrfs_tree_parent_check out of disk-io.h so that volumes.h and various .c files don't have to include disk-io.h just for it.
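[ editor's note: to make the write hole described in the RMW commit above concrete, here is a stand-alone user-space sketch. It is an illustration only, not kernel code; all names in it are hypothetical, and it models a 2-data + 1-parity RAID5 stripe with plain XOR. The first printf prints 0xff (still recoverable), the second 0x00 (corruption baked into parity) -- verifying data checksums before computing the new parity, as the patch does, catches the bad sector at the first point, where it can still be repaired. ]

  /* Hypothetical model of the RAID5 RMW write hole (cc -o rmw_demo rmw_demo.c). */
  #include <stdio.h>
  #include <string.h>

  #define STRIPE 4 /* tiny stripe size, for illustration only */

  /* XOR two stripes into @out; with two data stripes this is the parity. */
  static void xor_stripes(const unsigned char *a, const unsigned char *b,
                          unsigned char *out)
  {
          for (int i = 0; i < STRIPE; i++)
                  out[i] = a[i] ^ b[i];
  }

  int main(void)
  {
          unsigned char data1[STRIPE], data2[STRIPE], parity[STRIPE], rec[STRIPE];

          memset(data1, 0xff, STRIPE);            /* original data 1 */
          memset(data2, 0x00, STRIPE);            /* original data 2 */
          xor_stripes(data1, data2, parity);      /* parity = 0xff.. */

          memset(data1, 0x00, STRIPE);            /* data 1 silently corrupted */

          xor_stripes(data2, parity, rec);        /* recovery still works here */
          printf("before RMW: data 1 recovers to 0x%02x\n", rec[0]);

          /*
           * Sub-stripe write to data 2: RMW recomputes parity from the
           * corrupted data 1, baking the corruption into the parity.
           */
          memset(data2, 0x00, STRIPE);
          xor_stripes(data1, data2, parity);

          xor_stripes(data2, parity, rec);        /* corruption is now permanent */
          printf("after RMW:  data 1 recovers to 0x%02x\n", rec[0]);
          return 0;
  }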
Reviewed-by: Josef Bacik
Reviewed-by: Johannes Thumshirn
Reviewed-by: Qu Wenruo
Signed-off-by: Christoph Hellwig
Reviewed-by: David Sterba
[ use tree-checker.h for the structure ]
Signed-off-by: David Sterba
--- fs/btrfs/backref.c | 1 + fs/btrfs/disk-io.h | 30 +----------------------------- fs/btrfs/print-tree.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/tree-checker.h | 35 +++++++++++++++++++++++++++++++++-- fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/volumes.h | 2 +- 7 files changed, 39 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 55c072ba6747..21c92c74bf71 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -19,6 +19,7 @@ #include "accessors.h" #include "extent-tree.h" #include "relocation.h" +#include "tree-checker.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 03fe4154ffb8..363935cfc084 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,37 +25,9 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -/* All the extra info needed to verify the parentness of a tree block. */ -struct btrfs_tree_parent_check { - /* - * The owner check against the tree block. - * - * Can be 0 to skip the owner check. - */ - u64 owner_root; - - /* - * Expected transid, can be 0 to skip the check, but such skip - * should only be utlized for backref walk related code. - */ - u64 transid; - - /* - * The expected first key. - * - * This check can be skipped if @has_first_key is false, such skip - * can happen for case where we don't have the parent node key, - * e.g. reading the tree root, doing backref walk. - */ - struct btrfs_key first_key; - bool has_first_key; - - /* The expected level. Should always be set. */ - u8 level; -}; - struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1469aa55ad48..b93c96213304 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "print-tree.h" #include "accessors.h" +#include "tree-checker.h" struct root_name_map { u64 id; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e0522c6c0d67..5c636e00d77d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -28,6 +28,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "tree-checker.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index ece497e26558..bfb5efa4e01f 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,8 +6,39 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H -#include "ctree.h" -#include "extent_io.h" +#include + +struct extent_buffer; +struct btrfs_chunk; + +/* All the extra info needed to verify the parentness of a tree block. */ +struct btrfs_tree_parent_check { + /* + * The owner check against the tree block. + * + * Can be 0 to skip the owner check. + */ + u64 owner_root; + + /* + * Expected transid, can be 0 to skip the check, but such skip + * should only be utilized for backref walk related code. + */ + u64 transid; + + /* + * The expected first key. + * + * This check can be skipped if @has_first_key is false, such skip + * can happen for cases where we don't have the parent node key, + * e.g.
reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; /* * Comprehensive leaf checker. diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 779ad44d285f..146a6b198933 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -5,6 +5,7 @@ #include "disk-io.h" #include "fs.h" #include "accessors.h" +#include "tree-checker.h" struct tree_mod_root { u64 logical; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2c90e50c460a..ab551471c1f8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -11,7 +11,7 @@ #include #include "async-thread.h" #include "messages.h" -#include "disk-io.h" +#include "tree-checker.h" #include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) -- cgit From 103c19723c80bf74e4e70cd6cde3b8783a27aceb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 10:44:05 +0100 Subject: btrfs: split the bio submission path into a separate file The code used by btrfs_submit_bio only interacts with the rest of volumes.c through __btrfs_map_block (which itself is a more generic version of two exported helpers) and does not really have anything to do with volumes.c. Create a new bio.c file and a bio.h header going along with it for the btrfs_bio-based storage layer, which will grow even more going forward. Also update the file with my copyright notice given that a large part of the moved code was written or rewritten by me. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/bio.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/bio.h | 127 +++++++++++++++++++++ fs/btrfs/compression.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/relocation.c | 1 + fs/btrfs/super.c | 2 +- fs/btrfs/tree-log.c | 1 + fs/btrfs/volumes.c | 296 +------------------------------------------------ fs/btrfs/volumes.h | 110 +----------------- 14 files changed, 438 insertions(+), 403 deletions(-) create mode 100644 fs/btrfs/bio.c create mode 100644 fs/btrfs/bio.h (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 84fb3b4c35b0..555c962fdad6 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c new file mode 100644 index 000000000000..9e881dc91bae --- /dev/null +++ b/fs/btrfs/bio.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. 
+ */ + +#include +#include "bio.h" +#include "ctree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "dev-replace.h" +#include "rcu-string.h" +#include "zoned.h" + +static struct bio_set btrfs_bioset; + +/* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +static inline void btrfs_bio_init(struct btrfs_bio *bbio, + btrfs_bio_end_io_t end_io, void *private) +{ + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->end_io = end_io; + bbio->private = private; +} + +/* + * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio), end_io, private); + return bio; +} + +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + struct btrfs_bio *bbio; + + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, end_io, private); + + bio_trim(bio, offset >> 9, size >> 9); + bbio->iter = bio->bi_iter; + return bio; +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + bbio->end_io(bbio); +} + +static void btrfs_simple_end_io(struct bio *bio) +{ + struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) + btrfs_log_dev_io_error(bio, bbio->device); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { + bbio->end_io(bbio); + } +} + +static void btrfs_raid56_end_io(struct bio *bio) +{ + struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + bbio->end_io(bbio); + + btrfs_put_bioc(bioc); +} + +static void btrfs_orig_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + + if (bio->bi_status) { + atomic_inc(&bioc->error); + btrfs_log_dev_io_error(bio, 
stripe->dev); + } + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. + */ + if (atomic_read(&bioc->error) > bioc->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; + + bbio->end_io(bbio); + btrfs_put_bioc(bioc); +} + +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + + if (bio->bi_status) { + atomic_inc(&stripe->bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* Pass on control to the original bio this one was cloned from */ + bio_endio(stripe->bioc->orig_bio); + bio_put(bio); +} + +static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) +{ + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + bio_io_error(bio); + return; + } + + bio_set_dev(bio, dev->bdev); + + /* + * For zone append writing, bi_sector must point to the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, + dev->fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), + dev->devid, bio->bi_iter.bi_size); + + btrfsic_check_bio(bio); + submit_bio(bio); +} + +static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +{ + struct bio *orig_bio = bioc->orig_bio, *bio; + + ASSERT(bio_op(orig_bio) != REQ_OP_READ); + + /* Reuse the bio embedded into the btrfs_bio for the last mirror */ + if (dev_nr == bioc->num_stripes - 1) { + bio = orig_bio; + bio->bi_end_io = btrfs_orig_write_end_io; + } else { + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + bio_inc_remaining(orig_bio); + bio->bi_end_io = btrfs_clone_write_end_io; + } + + bio->bi_private = &bioc->stripes[dev_nr]; + bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; + bioc->stripes[dev_nr].bioc = bioc; + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); +} + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +{ + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + int ret; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (ret) { + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + return; + } + + if (map_length < length) { + btrfs_crit(fs_info, + "mapping failed logical %llu bio len %llu len %llu", + logical, length, map_length); + BUG(); + } + + if (!bioc) { + /* Single mirror read/write fast path */ + btrfs_bio(bio)->mirror_num = mirror_num; + btrfs_bio(bio)->device = smap.dev; + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + bio->bi_private = fs_info; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap.dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity
RAID write or read recovery */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); + else + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors */ + int total_devs = bioc->num_stripes; + int dev_nr; + + bioc->orig_bio = bio; + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } +} + +int __init btrfs_bioset_init(void) +{ + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void __cold btrfs_bioset_exit(void) +{ + bioset_exit(&btrfs_bioset); +} diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h new file mode 100644 index 000000000000..b12f84b3b341 --- /dev/null +++ b/fs/btrfs/bio.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#ifndef BTRFS_BIO_H +#define BTRFS_BIO_H + +#include +#include +#include "tree-checker.h" + +struct btrfs_bio; +struct btrfs_fs_info; + +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + +/* + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. + */ +struct btrfs_bio { + unsigned int mirror_num:7; + + /* + * Extra indicator for metadata bios. + * For some btrfs bios they use pages without a mapping, thus + * we can not rely on page->mapping->host to determine if + * it's a metadata bio. + */ + unsigned int is_metadata:1; + struct bvec_iter iter; + + /* for direct I/O */ + u64 file_offset; + + /* @device is for stripe IO submission. */ + struct btrfs_device *device; + union { + /* For data checksum verification. */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + }; + + /* For metadata parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; + + /* For read end I/O handling */ + struct work_struct end_io_work; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_bio, bio); +} + +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private); + + +static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + bbio->end_io(bbio); +} + +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) +{ + if (bbio->is_metadata) + return; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; + } +} + +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
+ * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iter - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); + +#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 30adf430241e..5122ca79f7ea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -27,7 +27,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "ordered-data.h" #include "compression.h" #include "extent_io.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91a088210e5a..d5be259845d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -23,7 +23,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 17f599027c3d..892d78c1853c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -42,6 +42,7 @@ #include "root-tree.h" #include "file-item.h" #include "orphan.h" +#include "tree-checker.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 65ba5c3658cf..95d54b5834c0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,7 +20,7 @@ #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "check-integrity.h" #include "locking.h" #include "rcu-string.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 352bbb33b76f..5de73466b2ca 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -14,7 +14,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "compression.h" #include "fs.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4248e6cabbdc..905ea19df125 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,7 +43,7 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "volumes.h" +#include "bio.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index aa80e51bc8ca..31ec4a7658ce 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -35,6 +35,7 @@ #include "file-item.h" #include "relocation.h" #include "super.h" +#include "tree-checker.h" /* * Relocation overview diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ea83dd9f735a..93f52ee85f6f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -35,7 +35,7 @@ #include "print-tree.h" #include "props.h" #include "xattr.h" -#include "volumes.h" +#include "bio.h" #include "export.h" #include "compression.h" #include "rcu-string.h" diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f8d7e7dc3a8..4387cd2ade97 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -29,6 +29,7 @@ #include "file-item.h" #include
"file.h" #include "orphan.h" +#include "tree-checker.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e51fd5f1042a..acab20f2863d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5,12 +5,9 @@ #include #include -#include #include -#include #include #include -#include #include #include #include @@ -23,8 +20,6 @@ #include "print-tree.h" #include "volumes.h" #include "raid56.h" -#include "async-thread.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" @@ -41,8 +36,6 @@ #include "scrub.h" #include "super.h" -static struct bio_set btrfs_bioset; - #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -255,11 +248,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map); /* * Device locking @@ -6364,11 +6352,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + stripe_nr * map->stripe_len; } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map) +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6651,266 +6639,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, NULL, NULL, 1); } -/* - * Initialize a btrfs_bio structure. This skips the embedded bio itself as it - * is already initialized by the block layer. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->end_io = end_io; - bbio->private = private; -} - -/* - * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. - * - * Just like the underlying bio_alloc_bioset it will not fail as it is backed by - * a mempool. 
- */ -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - - bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); - - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); - - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; -} - -static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) -{ - if (!dev || !dev->bdev) - return; - if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) - return; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); -} - -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, - struct bio *bio) -{ - if (bio->bi_opf & REQ_META) - return fs_info->endio_meta_workers; - return fs_info->endio_workers; -} - -static void btrfs_end_bio_work(struct work_struct *work) -{ - struct btrfs_bio *bbio = - container_of(work, struct btrfs_bio, end_io_work); - - bbio->end_io(bbio); -} - -static void btrfs_simple_end_io(struct bio *bio) -{ - struct btrfs_fs_info *fs_info = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(fs_info); - - if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); - - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { - bbio->end_io(bbio); - } -} - -static void btrfs_raid56_end_io(struct bio *bio) -{ - struct btrfs_io_context *bioc = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); - - btrfs_put_bioc(bioc); -} - -static void btrfs_orig_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - struct btrfs_io_context *bioc = stripe->bioc; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - - if (bio->bi_status) { - atomic_inc(&bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* - * Only send an error to the higher layers if it is beyond the tolerance - * threshold. 
- */ - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - - bbio->end_io(bbio); - btrfs_put_bioc(bioc); -} - -static void btrfs_clone_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - - if (bio->bi_status) { - atomic_inc(&stripe->bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* Pass on control to the original bio this one was cloned from */ - bio_endio(stripe->bioc->orig_bio); - bio_put(bio); -} - -static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) -{ - if (!dev || !dev->bdev || - test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (btrfs_op(bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bio_io_error(bio); - return; - } - - bio_set_dev(bio, dev->bdev); - - /* - * For zone append writing, bi_sector must point the beginning of the - * zone - */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } - } - btrfs_debug_in_rcu(dev->fs_info, - "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), - dev->devid, bio->bi_iter.bi_size); - - btrfsic_check_bio(bio); - submit_bio(bio); -} - -static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) -{ - struct bio *orig_bio = bioc->orig_bio, *bio; - - ASSERT(bio_op(orig_bio) != REQ_OP_READ); - - /* Reuse the bio embedded into the btrfs_bio for the last mirror */ - if (dev_nr == bioc->num_stripes - 1) { - bio = orig_bio; - bio->bi_end_io = btrfs_orig_write_end_io; - } else { - bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); - bio_inc_remaining(orig_bio); - bio->bi_end_io = btrfs_clone_write_end_io; - } - - bio->bi_private = &bioc->stripes[dev_nr]; - bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; - bioc->stripes[dev_nr].bioc = bioc; - btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); -} - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } - - if (!bioc) { - /* Single mirror read/write fast path */ - btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; - bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); - } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ - bio->bi_private = bioc; - bio->bi_end_io = btrfs_raid56_end_io; - if (bio_op(bio) == 
REQ_OP_READ) - raid56_parity_recover(bio, bioc, mirror_num); - else - raid56_parity_write(bio, bioc); - } else { - /* Write to multiple mirrors */ - int total_devs = bioc->num_stripes; - int dev_nr; - - bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) - btrfs_submit_mirrored_bio(bioc, dev_nr); - } -} - static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { @@ -8440,17 +8168,3 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) return true; } - -int __init btrfs_bioset_init(void) -{ - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; -} - -void __cold btrfs_bioset_exit(void) -{ - bioset_exit(&btrfs_bioset); -} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab551471c1f8..6b7a05f6cf82 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,7 +6,6 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H -#include #include #include #include "async-thread.h" @@ -373,8 +372,6 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; }; -#define BTRFS_BIO_INLINE_CSUM_SIZE 64 - #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -384,107 +381,6 @@ struct btrfs_fs_devices { - 2 * sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array. This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - -typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); - -/* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. - */ -struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ - u64 file_offset; - - /* @device is for stripe IO submission. */ - struct btrfs_device *device; - union { - /* For data checksum verification. */ - struct { - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - }; - - /* For metadata parentness verification. */ - struct btrfs_tree_parent_check parent_check; - }; - - /* End I/O information supplied to btrfs_bio_alloc */ - btrfs_bio_end_io_t end_io; - void *private; - - /* For read end I/O handling */ - struct work_struct end_io_work; - - /* - * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_bio but relies on bio being last. 
 - */ - struct bio bio; -}; - -static inline struct btrfs_bio *btrfs_bio(struct bio *bio) -{ - return container_of(bio, struct btrfs_bio, bio); -} - -int __init btrfs_bioset_init(void); -void __cold btrfs_bioset_exit(void); - -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - -static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - bbio->end_io(bbio); -} - -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} - -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. - * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - struct btrfs_io_stripe { struct btrfs_device *dev; union { @@ -641,6 +537,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); @@ -652,7 +553,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path,
-- cgit

From bacf60e515862904dd91a332f3a54f092416eaa6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Nov 2022 10:44:06 +0100
Subject: btrfs: move repair_io_failure to bio.c

repair_io_failure ties directly into all the gory low-level details of mapping a bio with a logical address to the actual physical location. Move it right below btrfs_submit_bio to keep all the related logic together. Also move btrfs_repair_eb_io_failure to its caller in disk-io.c now that repair_io_failure is available in a header.
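[ editor's note: the repair write pattern described above can be illustrated with a small user-space analogy. This is a hypothetical sketch, not the kernel implementation: it rewrites only the one bad replica, synchronously, the way btrfs_repair_io_failure() writes only the bad mirror instead of all copies. The file arguments and the fixed 4K sector size are assumptions made for the example. ]

  /* Hypothetical one-copy repair write (cc -o repair_demo repair_demo.c). */
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  #define SECTOR 4096

  /* Copy one known-good sector over the bad replica and flush it. */
  static int repair_one_sector(int good_fd, int bad_fd, off_t offset)
  {
          char buf[SECTOR];

          if (pread(good_fd, buf, SECTOR, offset) != SECTOR)
                  return -1;
          /* Write only the bad copy; the good mirror is left untouched. */
          if (pwrite(bad_fd, buf, SECTOR, offset) != SECTOR)
                  return -1;
          /* Wait for the write, like the submit_bio_wait() in the patch. */
          return fsync(bad_fd);
  }

  int main(int argc, char **argv)
  {
          int good_fd, bad_fd;

          if (argc != 4) {
                  fprintf(stderr, "usage: %s <good> <bad> <offset>\n", argv[0]);
                  return 1;
          }
          good_fd = open(argv[1], O_RDONLY);
          bad_fd = open(argv[2], O_WRONLY);
          if (good_fd < 0 || bad_fd < 0)
                  return 1;
          return repair_one_sector(good_fd, bad_fd, atoll(argv[3])) ? 1 : 0;
  }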
Reviewed-by: Josef Bacik
Reviewed-by: Johannes Thumshirn
Signed-off-by: Christoph Hellwig
Reviewed-by: David Sterba
Signed-off-by: David Sterba
--- fs/btrfs/bio.c | 90 +++++++++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.c | 24 +++++++++++ fs/btrfs/extent_io.c | 117 +-------------------------------------------------- fs/btrfs/extent_io.h | 1 - 4 files changed, 116 insertions(+), 116 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 9e881dc91bae..b8fb7ef6b520 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -276,6 +276,96 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror } } +/* + * Submit a repair write. + * + * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * RAID setup. Here we only want to write the one bad copy, so we do the + * mapping ourselves and submit the bio directly. + * + * The I/O is issued synchronously to block the repair read completion from + * freeing the bio. + */ +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) +{ + struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; + u64 map_length = 0; + u64 sector; + struct btrfs_io_context *bioc = NULL; + int ret = 0; + + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); + BUG_ON(!mirror_num); + + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; + + map_length = length; + + /* + * Avoid races with device replace and make sure our bioc has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); + if (btrfs_is_parity_mirror(fs_info, logical, length)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bioc, 0); + if (ret) + goto out_counter_dec; + ASSERT(bioc->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bioc, mirror_num); + if (ret) + goto out_counter_dec; + BUG_ON(mirror_num != bioc->mirror_num); + } + + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); + + if (!dev || !dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { + ret = -EIO; + goto out_counter_dec; + } + + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { + /* try to remap that extent elsewhere?
*/ + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + goto out_bio_uninit; + } + + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", + ino, start, btrfs_dev_name(dev), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: + btrfs_bio_counter_dec(fs_info); + return ret; +} + int __init btrfs_bioset_init(void) { if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d5be259845d1..0888d484df80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -255,6 +255,30 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, return ret; } +static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u64 start = eb->start; + int i, num_pages = num_extent_pages(eb); + int ret = 0; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, + start, p, start - page_offset(p), mirror_num); + if (ret) + break; + start += PAGE_SIZE; + } + + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 95d54b5834c0..8528e7d3f38f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -531,119 +531,6 @@ static void free_io_failure(struct btrfs_inode *inode, kfree(rec); } -/* - * this bypasses the standard btrfs submit functions deliberately, as - * the standard behavior is to write all copies in a raid setup. here we only - * want to write the one bad copy. so we do the mapping for ourselves and issue - * submit_bio directly. - * to avoid any synchronization issues, wait for the data after writing, which - * actually prevents the read that triggered the error from finishing. - * currently, there can be no more than two copies of every data bit. thus, - * exactly one rewrite is required. - */ -static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) -{ - struct btrfs_device *dev; - struct bio_vec bvec; - struct bio bio; - u64 map_length = 0; - u64 sector; - struct btrfs_io_context *bioc = NULL; - int ret = 0; - - ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); - BUG_ON(!mirror_num); - - if (btrfs_repair_one_zone(fs_info, logical)) - return 0; - - map_length = length; - - /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are doing the - * read repair operation. - */ - btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length)) { - /* - * Note that we don't use BTRFS_MAP_WRITE because it's supposed - * to update all raid stripes, but here we just want to correct - * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad - * stripe's dev and sector. 
- */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bioc, 0); - if (ret) - goto out_counter_dec; - ASSERT(bioc->mirror_num == 1); - } else { - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bioc, mirror_num); - if (ret) - goto out_counter_dec; - BUG_ON(mirror_num != bioc->mirror_num); - } - - sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - dev = bioc->stripes[bioc->mirror_num - 1].dev; - btrfs_put_bioc(bioc); - - if (!dev || !dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - ret = -EIO; - goto out_counter_dec; - } - - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = sector; - __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); - ret = submit_bio_wait(&bio); - if (ret) { - /* try to remap that extent elsewhere? */ - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; - } - - btrfs_info_rl_in_rcu(fs_info, - "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(dev), sector); - ret = 0; - -out_bio_uninit: - bio_uninit(&bio); -out_counter_dec: - btrfs_bio_counter_dec(fs_info); - return ret; -} - -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; - int i, num_pages = num_extent_pages(eb); - int ret = 0; - - if (sb_rdonly(fs_info->sb)) - return -EROFS; - - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, - start - page_offset(p), mirror_num); - if (ret) - break; - start += PAGE_SIZE; - } - - return ret; -} - static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) { if (cur_mirror == failrec->num_copies) @@ -691,7 +578,7 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, mirror = failrec->this_mirror; do { mirror = prev_mirror(failrec, mirror); - repair_io_failure(fs_info, ino, start, failrec->len, + btrfs_repair_io_failure(fs_info, ino, start, failrec->len, failrec->logical, page, pg_offset, mirror); } while (mirror != failrec->failed_mirror); @@ -822,7 +709,7 @@ int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_ * * Since we're only doing repair for one sector, we only need to get * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. + * everything for btrfs_repair_io_failure to do the rest for us. */ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); if (failrec->this_mirror == failrec->failed_mirror) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a0bafc7f6c07..9b486419854e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -245,7 +245,6 @@ int extent_invalidate_folio(struct extent_io_tree *tree, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we -- cgit From 1fe5ebc4e17dfbffeefe52abab75b02ad0f3a97e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:10 -0500 Subject: btrfs: move root helpers back into ctree.h These accidentally got brought into accessors.h, but belong with the btrfs_root definitions which are currently in ctree.h. 
Move these to make it easier to sync accessors.[ch] into btrfs-progs.

Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
--- fs/btrfs/accessors.h | 17 ----------------- fs/btrfs/ctree.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index cb59b69d2af1..57ba6894a5f4 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -712,23 +712,6 @@ BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 99defab7e1f6..4e58b0532b36 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -335,6 +335,23 @@ struct btrfs_root { #endif }; +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range.
-- cgit

From 3a3178c7f7675661d7bd0c67f57420e20982bd34 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 15 Nov 2022 11:16:11 -0500
Subject: btrfs: move leaf_data_end into ctree.c

This is only used in ctree.c, with the exception of zeroing out extent buffers we're getting ready to write out. In theory we shouldn't have an extent buffer with 0 items that we're writing out; however, I'd rather be safe than sorry, so open code it in extent_io.c and then copy the helper into ctree.c. This will make it easier to sync accessors.[ch] into btrfs-progs, as this requires a helper that isn't defined in accessors.h.

Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
--- fs/btrfs/accessors.h | 13 ------------- fs/btrfs/ctree.c | 13 +++++++++++++ fs/btrfs/extent_io.c | 6 +++++- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 57ba6894a5f4..b9d9a69685df 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -897,19 +897,6 @@ const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); -/* - * The leaf data grows from end-to-front in the node.
this returns the address - * of the start of the last item, which is the stop of the leaf data stack. - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f75e398d7b71..dc38c24a0ffa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -51,6 +51,19 @@ static const struct btrfs_csums { .driver = "blake2b-256" }, }; +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. + */ +static unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8528e7d3f38f..9fc9f8068069 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2537,7 +2537,11 @@ static void prepare_eb_write(struct extent_buffer *eb) * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + end = BTRFS_LEAF_DATA_OFFSET; + if (nritems == 0) + end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); + else + end += btrfs_item_offset(eb, nritems - 1); memzero_extent_buffer(eb, start, end - start); } } -- cgit From 6bfd0ffa6f2ae0ead92af7c4521626cd456115c5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:12 -0500 Subject: btrfs: move file_extent_item helpers into file-item.h These helpers use functions that are in multiple places, which makes it tricky to sync them into btrfs-progs. Move them to file-item.h and then include file-item.h in places that use these helpers. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 22 ---------------------- fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 8 -------- fs/btrfs/file-item.h | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.c | 1 + 5 files changed, 35 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index b9d9a69685df..f0d017f9407f 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -915,16 +915,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); -static inline unsigned long btrfs_file_extent_inline_start( - const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, @@ -946,18 +936,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* - * Returns the number of bytes used by the item on disk, minus the size of any - * extent headers. 
If a file is compressed on disk, this is the compressed - * size. - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dc38c24a0ffa..e85b243be5a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -22,6 +22,7 @@ #include "accessors.h" #include "extent-tree.h" #include "relocation.h" +#include "file-item.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4e58b0532b36..428f51efd455 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -461,14 +461,6 @@ static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) -{ - return BTRFS_MAX_ITEM_SIZE(info) - - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 95b371208b30..031225668434 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,6 +3,39 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include "accessors.h" + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) + +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* + * Return the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. + */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 1c2d418dda6a..32e051101a27 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -28,6 +28,7 @@ #include "btrfs_inode.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" /* * Error message should follow the following format: -- cgit From 9b48addac406bd10afe447c2182b241dbde1d1a6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:13 -0500 Subject: btrfs: move eb offset helpers into extent_io.h These are very specific to how the extent buffer is defined, so this differs between btrfs-progs and the kernel. Make things easier by moving these helpers into extent_io.h so we don't have to worry about this when syncing ctree.h. 
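[Editor's note] To illustrate how the two helpers moved here cooperate, below is a hypothetical usage sketch (not code from this patch) of copying bytes out of an extent buffer one page at a time; the eb->pages layout and the helper names are taken from the diff that follows, everything else is an assumption for the example.

	/*
	 * Hypothetical sketch: copy @len bytes starting at @start out of an
	 * extent buffer, walking page by page. Illustrative only.
	 */
	static void example_read_eb(const struct extent_buffer *eb,
				    unsigned long start, unsigned long len,
				    char *dst)
	{
		while (len > 0) {
			unsigned long index = get_eb_page_index(start);
			size_t offset = get_eb_offset_in_page(eb, start);
			unsigned long cur = min_t(unsigned long, len,
						  PAGE_SIZE - offset);
			char *kaddr = page_address(eb->pages[index]);

			memcpy(dst, kaddr + offset, cur);
			dst += cur;
			start += cur;
			len -= cur;
		}
	}
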
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 33 --------------------------------- fs/btrfs/extent_io.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 428f51efd455..6169d3b28475 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -686,39 +686,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(struct extent_buffer *leaf); -/* - * Get the correct offset inside the page of extent buffer. - * - * @eb: target extent buffer - * @start: offset inside the extent buffer - * - * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. - */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. - * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. - */ - return offset_in_page(offset + eb->start); -} - -static inline unsigned long get_eb_page_index(unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. - * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. - */ - return offset >> PAGE_SHIFT; -} - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9b486419854e..a2c82448b2e0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -95,6 +95,39 @@ struct extent_buffer { #endif }; +/* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. + */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. + */ + return offset >> PAGE_SHIFT; +} + /* * Structure to record how many bytes and which ranges are set/cleared */ -- cgit From 0e6c40ebbb18b32cf4b5fcb3173eaede89b9f440 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:14 -0500 Subject: btrfs: move the csum helpers into ctree.h These got moved because of copy+paste, but this code exists in ctree.c, so move the declarations back into ctree.h. 
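[Editor's note] As a quick reminder of what these declarations cover, a caller reporting the checksum configuration of a superblock might look like the following; this is an illustrative sketch, the example function and the pr_info() output format are assumptions, not kernel code.

	/* Illustrative only: report a super block's checksum configuration. */
	static void example_report_csum(const struct btrfs_super_block *sb)
	{
		u16 csum_type = btrfs_super_csum_type(sb);

		pr_info("csum %s (type %u, %d bytes)\n",
			btrfs_super_csum_name(csum_type), csum_type,
			btrfs_super_csum_size(sb));
	}
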
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 5 ----- fs/btrfs/ctree.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index f0d017f9407f..066a662f38c3 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -892,11 +892,6 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6169d3b28475..6965703a81b6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -700,6 +700,11 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. -- cgit From 42c9419a4c01910e9c46b0c2bb9090f76295bf01 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:15 -0500 Subject: btrfs: pass the extent buffer for the btrfs_item_nr helpers This is actually a change for extent tree v2, but it exists in btrfs-progs but not in the kernel. This makes it annoying to sync accessors.h with btrfs-progs, and since this is the way I need it for extent-tree v2 simply update these helpers to take the extent buffer in order to make syncing possible now, and make the extent tree v2 stuff easier moving forward. 
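[Editor's note] The practical effect on callers is small but mechanical; a hypothetical call site under the new convention (the example function itself is an assumption) looks like this:

	/* Hypothetical sketch of the new calling convention. */
	static unsigned long example_item_start(const struct extent_buffer *leaf,
						int slot)
	{
		/* Before this patch the eb was implicit: btrfs_item_nr_offset(slot). */
		return btrfs_item_nr_offset(leaf, slot);
	}
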
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 18 +++++++++--------- fs/btrfs/ctree.c | 35 ++++++++++++++++++----------------- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tree-checker.c | 4 ++-- 4 files changed, 30 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 066a662f38c3..2b4fb696142b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -417,37 +417,37 @@ BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); -static inline unsigned long btrfs_item_nr_offset(int nr) +static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_leaf, items) + sizeof(struct btrfs_item) * nr; } -static inline struct btrfs_item *btrfs_item_nr(int nr) +static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) { - return (struct btrfs_item *)btrfs_item_nr_offset(nr); + return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr); } #define BTRFS_ITEM_SETGET_FUNCS(member) \ static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ { \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ } \ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ } \ static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ int slot) \ { \ - struct btrfs_item *item = btrfs_item_nr(slot); \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ return btrfs_token_raw_item_##member(token, item); \ } \ static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ int slot, u32 val) \ { \ - struct btrfs_item *item = btrfs_item_nr(slot); \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ btrfs_set_token_raw_item_##member(token, item, val); \ } @@ -462,7 +462,7 @@ static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(nr); + struct btrfs_item *item = btrfs_item_nr(eb, nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } @@ -470,7 +470,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb, static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(nr); + struct btrfs_item *item = btrfs_item_nr(eb, nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e85b243be5a2..cdc112d3bab2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3033,13 +3033,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), - btrfs_item_nr_offset(0), + memmove_extent_buffer(right, btrfs_item_nr_offset(right, push_items), + btrfs_item_nr_offset(right, 0), right_nritems * sizeof(struct btrfs_item)); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(0), - 
btrfs_item_nr_offset(left_nritems - push_items), + copy_extent_buffer(right, left, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(left, left_nritems - push_items), push_items * sizeof(struct btrfs_item)); /* update the item pointers */ @@ -3233,8 +3233,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, /* push data from right to left */ copy_extent_buffer(left, right, - btrfs_item_nr_offset(btrfs_header_nritems(left)), - btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left, btrfs_header_nritems(left)), + btrfs_item_nr_offset(right, 0), push_items * sizeof(struct btrfs_item)); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - @@ -3272,8 +3272,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(push_items), + memmove_extent_buffer(right, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(left, push_items), (btrfs_header_nritems(right) - push_items) * sizeof(struct btrfs_item)); } @@ -3407,8 +3407,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), + copy_extent_buffer(right, l, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(l, mid), nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, @@ -3784,8 +3784,8 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot + 1), + btrfs_item_nr_offset(leaf, slot), (nritems - slot) * sizeof(struct btrfs_item)); } @@ -4077,9 +4077,10 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_extent_buffer(leaf, + btrfs_item_nr_offset(leaf, slot + batch->nr), + btrfs_item_nr_offset(leaf, slot), + (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + @@ -4333,8 +4334,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), - btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot), + btrfs_item_nr_offset(leaf, slot + nr), sizeof(struct btrfs_item) * (nritems - slot - nr)); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9fc9f8068069..f1df13c5fc6f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2536,7 +2536,7 @@ static void prepare_eb_write(struct extent_buffer *eb) * Leaf: * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 */ - start = btrfs_item_nr_offset(nritems); + start = btrfs_item_nr_offset(eb, nritems); end = BTRFS_LEAF_DATA_OFFSET; if (nritems == 0) end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 32e051101a27..baad1ed7e111 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1784,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* Also check if the item pointer overlaps with btrfs item. */ if (unlikely(btrfs_item_ptr_offset(leaf, slot) < - btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) { + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) { generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", - btrfs_item_nr_offset(slot) + + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); return -EUCLEAN; -- cgit From e23efd8e8767165a6103cf0a4fe273f6b9f182f2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:16 -0500 Subject: btrfs: add eb to btrfs_node_key_ptr_offset This is a change needed for extent tree v2, as we will be growing the header size. This exists in btrfs-progs currently, and not having it makes syncing accessors.[ch] more problematic. So make this change to set us up for extent tree v2 and match what btrfs-progs does to make syncing easier. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 2 +- fs/btrfs/accessors.h | 4 ++-- fs/btrfs/ctree.c | 28 ++++++++++++++-------------- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tree-mod-log.c | 4 ++-- 5 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 7a7b7d263102..206cf1612c1d 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -168,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - unsigned long ptr = btrfs_node_key_ptr_offset(nr); + unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr); read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 2b4fb696142b..88eea44fdd7f 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -392,7 +392,7 @@ static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); } -static inline unsigned long btrfs_node_key_ptr_offset(int nr) +static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; @@ -406,7 +406,7 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, { unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); + ptr = btrfs_node_key_ptr_offset(eb, nr); write_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdc112d3bab2..32facf8d7319 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2612,8 +2612,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(dst_nritems), - btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(dst, dst_nritems), + btrfs_node_key_ptr_offset(src, 0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { @@ -2621,8 +2621,8 
@@ static int push_node_left(struct btrfs_trans_handle *trans, * Don't call btrfs_tree_mod_log_insert_move() here, key removal * was already fully logged by btrfs_tree_mod_log_eb_copy() above. */ - memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(push_items), + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), + btrfs_node_key_ptr_offset(src, push_items), (src_nritems - push_items) * sizeof(struct btrfs_key_ptr)); } @@ -2682,8 +2682,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); - memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), - btrfs_node_key_ptr_offset(0), + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), + btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); @@ -2694,8 +2694,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(src_nritems - push_items), + btrfs_node_key_ptr_offset(dst, 0), + btrfs_node_key_ptr_offset(src, src_nritems - push_items), push_items * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(src, src_nritems - push_items); @@ -2798,8 +2798,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); } memmove_extent_buffer(lower, - btrfs_node_key_ptr_offset(slot + 1), - btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(lower, slot + 1), + btrfs_node_key_ptr_offset(lower, slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { @@ -2881,8 +2881,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(split, c, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(mid), + btrfs_node_key_ptr_offset(split, 0), + btrfs_node_key_ptr_offset(c, mid), (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); @@ -4240,8 +4240,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(ret < 0); } memmove_extent_buffer(parent, - btrfs_node_key_ptr_offset(slot), - btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(parent, slot), + btrfs_node_key_ptr_offset(parent, slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f1df13c5fc6f..f1d5a851968f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2529,7 +2529,7 @@ static void prepare_eb_write(struct extent_buffer *eb) /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); + end = btrfs_node_key_ptr_offset(eb, nritems); memzero_extent_buffer(eb, end, eb->len - end); } else { /* diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 146a6b198933..a555baa0143a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -697,8 +697,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); + o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); break; -- 
cgit From 637e3b48c22e52daa645b49b73630af3f09328ae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:17 -0500 Subject: btrfs: add helpers for manipulating leaf items and data We have some gnarly memmove and copy_extent_buffer calls for leaf manipulation. This is because our item offsets aren't absolute, they're based on 0 being where the items start in the leaf, which is after the btrfs_header. This means any manipulation of the data requires adding sizeof(struct btrfs_header) to the offsets we pull from the items. Moving the items themselves is easier as the helpers are absolute offsets, however we of course have to call the helpers to get the offsets for the item numbers. This makes for copy_extent_buffer/memmove_extent_buffer calls that are kind of hard to reason about what's happening. Fix this by pushing this logic into helpers. For data we'll only use the item provided offsets, and the helpers will use the BTRFS_LEAF_DATA_OFFSET addition for the offsets. Additionally for the item manipulation simply pass in the item numbers, and then the helpers will call the offset helper to get the actual offset into the leaf. The diffstat makes this look like more code, but that's simply because I added comments for the helpers, it's net negative for the amount of code, and is easier to reason. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 181 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 115 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 32facf8d7319..b5d203045dc7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -65,6 +65,91 @@ static unsigned int leaf_data_end(const struct extent_buffer *leaf) return btrfs_item_offset(leaf, nr - 1); } +/* + * Move data in a @leaf (using memmove, safe for overlapping ranges). + * + * @leaf: leaf that we're doing a memmove on + * @dst_offset: item data offset we're moving to + * @src_offset: item data offset were' moving from + * @len: length of the data we're moving + * + * Wrapper around memmove_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void memmove_leaf_data(const struct extent_buffer *leaf, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long len) +{ + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + dst_offset, + BTRFS_LEAF_DATA_OFFSET + src_offset, len); +} + +/* + * Copy item data from @src into @dst at the given @offset. + * + * @dst: destination leaf that we're copying into + * @src: source leaf that we're copying from + * @dst_offset: item data offset we're copying to + * @src_offset: item data offset were' copying from + * @len: length of the data we're copying + * + * Wrapper around copy_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. 
+ */ +static inline void copy_leaf_data(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + copy_extent_buffer(dst, src, BTRFS_LEAF_DATA_OFFSET + dst_offset, + BTRFS_LEAF_DATA_OFFSET + src_offset, len); +} + +/* + * Move items in a @leaf (using memmove). + * + * @dst: destination leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around memmove_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void memmove_leaf_items(const struct extent_buffer *leaf, + int dst_item, int src_item, int nr_items) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), + btrfs_item_nr_offset(leaf, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +/* + * Copy items from @src into @dst at the given @offset. + * + * @dst: destination leaf for the items + * @src: source leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around copy_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void copy_leaf_items(const struct extent_buffer *dst, + const struct extent_buffer *src, + int dst_item, int src_item, int nr_items) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), + btrfs_item_nr_offset(src, src_item), + nr_items * sizeof(struct btrfs_item)); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); @@ -3022,25 +3107,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* make room in the right data area */ data_end = leaf_data_end(right); - memmove_extent_buffer(right, - BTRFS_LEAF_DATA_OFFSET + data_end - push_space, - BTRFS_LEAF_DATA_OFFSET + data_end, - BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); + memmove_leaf_data(right, data_end - push_space, data_end, + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), - push_space); + copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(right, push_items), - btrfs_item_nr_offset(right, 0), - right_nritems * sizeof(struct btrfs_item)); + memmove_leaf_items(right, push_items, 0, right_nritems); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(left, left_nritems - push_items), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ btrfs_init_map_token(&token, right); @@ -3232,19 +3309,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ - copy_extent_buffer(left, right, - btrfs_item_nr_offset(left, btrfs_header_nritems(left)), - btrfs_item_nr_offset(right, 0), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 
btrfs_item_offset(right, push_items - 1); - copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(left) - push_space, - BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset(right, push_items - 1), - push_space); + copy_leaf_data(left, right, leaf_data_end(left) - push_space, + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); @@ -3267,15 +3338,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (push_items < right_nritems) { push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); - memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(right), push_space); + memmove_leaf_data(right, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(left, push_items), - (btrfs_header_nritems(right) - push_items) * - sizeof(struct btrfs_item)); + memmove_leaf_items(right, 0, push_items, + btrfs_header_nritems(right) - push_items); } btrfs_init_map_token(&token, right); @@ -3407,14 +3475,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(l, mid), - nritems * sizeof(struct btrfs_item)); + copy_leaf_items(right, l, 0, mid, nritems); - copy_extent_buffer(right, l, - BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(l), data_copy_size); + copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, + leaf_data_end(l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); @@ -3784,9 +3848,7 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot + 1), - btrfs_item_nr_offset(leaf, slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3897,9 +3959,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start + new_size - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; u64 offset; @@ -3924,9 +3985,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) } } - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); btrfs_set_disk_key_offset(&disk_key, offset + size_diff); @@ -3991,9 +4051,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - data_size, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + memmove_leaf_data(leaf, data_end - data_size, 
data_end, + old_data - data_end); data_end = old_data; old_size = btrfs_item_size(leaf, slot); @@ -4077,16 +4136,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, - btrfs_item_nr_offset(leaf, slot + batch->nr), - btrfs_item_nr_offset(leaf, slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - batch->total_data_size, - BTRFS_LEAF_DATA_OFFSET + data_end, - old_data - data_end); + memmove_leaf_data(leaf, data_end - batch->total_data_size, + data_end, old_data - data_end); data_end = old_data; } @@ -4321,10 +4375,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = 0; i < nr; i++) dsize += btrfs_item_size(leaf, slot + i); - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + dsize, - BTRFS_LEAF_DATA_OFFSET + data_end, - last_off - data_end); + memmove_leaf_data(leaf, data_end + dsize, data_end, + last_off - data_end); btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { @@ -4334,10 +4386,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot), - btrfs_item_nr_offset(leaf, slot + nr), - sizeof(struct btrfs_item) * - (nritems - slot - nr)); + memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); } btrfs_set_header_nritems(leaf, nritems - nr); nritems -= nr; -- cgit From 8009adf306452e9b43db36f2d02fedfe5eca1b5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:18 -0500 Subject: btrfs: remove BTRFS_LEAF_DATA_OFFSET This is simply the same thing as btrfs_item_nr_offset(leaf, 0), so remove this helper and replace it's usage with the above statement. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 6 ++---- fs/btrfs/ctree.c | 8 ++++---- fs/btrfs/extent_io.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 88eea44fdd7f..e6228ff73c81 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -9,8 +9,6 @@ struct btrfs_map_token { unsigned long offset; }; -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); /* @@ -1028,9 +1026,9 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, /* Cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b5d203045dc7..76b99bcc849d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -83,8 +83,8 @@ static inline void memmove_leaf_data(const struct extent_buffer *leaf, unsigned long src_offset, unsigned long len) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + dst_offset, - BTRFS_LEAF_DATA_OFFSET + src_offset, len); + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, + btrfs_item_nr_offset(leaf, 0) + src_offset, len); } /* @@ -106,8 +106,8 @@ static inline void copy_leaf_data(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - copy_extent_buffer(dst, src, BTRFS_LEAF_DATA_OFFSET + dst_offset, - BTRFS_LEAF_DATA_OFFSET + src_offset, len); + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, + btrfs_item_nr_offset(src, 0) + src_offset, len); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f1d5a851968f..83dd3aa59663 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2537,7 +2537,7 @@ static void prepare_eb_write(struct extent_buffer *eb) * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(eb, nritems); - end = BTRFS_LEAF_DATA_OFFSET; + end = btrfs_item_nr_offset(eb, 0); if (nritems == 0) end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); else -- cgit From 0c7030038e6106711c5d0b237c980905dd3244ec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:19 -0500 Subject: btrfs: add nr_global_roots to the super block definition We already have this defined in btrfs-progs, add it to the kernel to make it easier to sync these files into btrfs-progs. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index e6228ff73c81..75c181b579eb 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -889,6 +889,8 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, + nr_global_roots, 64); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, -- cgit From 054056bd0a8da2747e51646a6ac5af6ffbae5b69 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:20 -0500 Subject: btrfs: add stack helpers for a few btrfs items We don't have these defined in the kernel because we don't have any users of these helpers. However we do use them in btrfs-progs, so define them to make keeping accessors.h in sync between progs and the kernel easier. 
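[Editor's note] The distinction is worth spelling out: the regular setget helpers read and write through an extent buffer, while the _stack_ variants introduced below operate on a plain structure in ordinary memory. A hedged sketch of a hypothetical use, filling a dev extent item before copying it into a leaf; the function and its parameters are assumptions for illustration, the setters are the ones added by this patch:

	/* Illustrative sketch: fill a dev extent item in ordinary memory. */
	static void example_fill_dev_extent(struct btrfs_dev_extent *dev_extent,
					    u64 chunk_offset, u64 num_bytes)
	{
		btrfs_set_stack_dev_extent_chunk_tree(dev_extent,
						      BTRFS_CHUNK_TREE_OBJECTID);
		btrfs_set_stack_dev_extent_chunk_objectid(dev_extent,
						BTRFS_FIRST_CHUNK_TREE_OBJECTID);
		btrfs_set_stack_dev_extent_chunk_offset(dev_extent, chunk_offset);
		btrfs_set_stack_dev_extent_length(dev_extent, num_bytes);
	}
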
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 75c181b579eb..ceadfc5d6c66 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -221,12 +221,26 @@ static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); @@ -248,6 +262,8 @@ BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_extref */ BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, @@ -297,6 +313,14 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); + BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); @@ -479,6 +503,9 @@ BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); @@ -972,6 +999,16 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, + struct btrfs_qgroup_limit_item, flags, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, + struct btrfs_qgroup_limit_item, max_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, + struct btrfs_qgroup_limit_item, max_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, + struct btrfs_qgroup_limit_item, rsv_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, + struct btrfs_qgroup_limit_item, rsv_excl, 64); /* btrfs_dev_replace_item */ BTRFS_SETGET_FUNCS(dev_replace_src_devid, -- cgit From a4c853af0c511d7e0f7cb306bbc8a4f1dbdb64ca Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 16 Nov 2022 22:23:53 +0800 Subject: btrfs: add might_sleep() annotations Add annotations to functions that might sleep due to allocations or IO and could be called from various contexts. In case of btrfs_search_slot it's not obvious why it would sleep: btrfs_search_slot setup_nodes_for_search reada_for_balance btrfs_readahead_node_child btrfs_readahead_tree_block btrfs_find_create_tree_block alloc_extent_buffer kmem_cache_zalloc /* allocate memory non-atomically, might sleep */ kmem_cache_alloc(GFP_NOFS|__GFP_NOFAIL|__GFP_ZERO) read_extent_buffer_pages submit_extent_page /* disk IO, might sleep */ submit_one_bio Other examples where the sleeping could happen is in 3 places might sleep in update_qgroup_limit_item(), as shown below: update_qgroup_limit_item btrfs_alloc_path /* allocate memory non-atomically, might sleep */ kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS) Signed-off-by: ChenXiaoSong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 76b99bcc849d..4754c9101a4c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -184,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void) struct btrfs_path *btrfs_alloc_path(void) { + might_sleep(); + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } @@ -2046,6 +2048,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + might_sleep(); + lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); -- cgit From d7c9e1be2876f63fb2178a24e0c1d5733ff98d47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Nov 2022 15:06:09 -0500 Subject: btrfs: fix uninitialized parent in insert_state I don't know how this isn't caught when we build this in the kernel, but while syncing extent-io-tree.c into btrfs-progs I got an error because parent could potentially be uninitialized when we link in a new node, specifically when the extent_io_tree is empty. This means we could have garbage in the parent color. I don't know what the ramifications are of that, but it's probably not great, so fix this by initializing parent to NULL. I spot checked all of our other usages in btrfs and we appear to be doing the correct thing everywhere else. 
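[Editor's note] For reference, the canonical rbtree insertion pattern that makes the fix obvious: when the tree is empty the descent loop never runs, so 'parent' reaches rb_link_node() without ever being assigned unless it starts out NULL. A generic sketch, where entry_less() stands in for whatever comparator the tree uses:

	struct rb_node **node = &tree->rb_node;
	struct rb_node *parent = NULL;	/* must start NULL: loop may not run */

	while (*node) {
		parent = *node;
		if (entry_less(new, *node))	/* stand-in comparator */
			node = &(*node)->rb_left;
		else
			node = &(*node)->rb_right;
	}
	rb_link_node(new, parent, node);
	rb_insert_color(new, tree);
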
Fixes: c7e118cf98c7 ("btrfs: open code rbtree search in insert_state") CC: stable@vger.kernel.org # 6.0+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 21fa15123af8..82ca6a11e11a 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -395,7 +395,7 @@ static int insert_state(struct extent_io_tree *tree, u32 bits, struct extent_changeset *changeset) { struct rb_node **node; - struct rb_node *parent; + struct rb_node *parent = NULL; const u64 end = state->end; set_state_bits(tree, state, bits, changeset); -- cgit From 26df39a9e5a8985674e814f0b27b25e8b4eb9ba7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Nov 2022 15:09:42 -0500 Subject: btrfs: fix uninitialized variable in find_first_clear_extent_bit This was caught when syncing extent-io-tree.c into btrfs-progs. This however isn't really a problem, the only way next would be uninitialized is if we found the range we were looking for, and in this case we don't care about next. However it's a compile error, so fix it up. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 82ca6a11e11a..9ae9cd1e7035 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1425,7 +1425,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - struct extent_state *prev = NULL, *next; + struct extent_state *prev = NULL, *next = NULL; spin_lock(&tree->lock); -- cgit From 63d5429f68a3d4c4aa27e65a05196c17f86c41d6 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Sat, 19 Nov 2022 11:13:29 +0300 Subject: btrfs: replace strncpy() with strscpy() Using strncpy() on NUL-terminated strings are deprecated. To avoid possible forming of non-terminated string strscpy() should be used. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
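[Editor's note] The semantic difference in a nutshell: strncpy() neither guarantees NUL termination when the source fills the buffer nor reports truncation, while strscpy() always terminates the destination and returns -E2BIG if it had to truncate. A small sketch of the checking pattern this patch adopts ('src' and the warning text are assumptions for the example):

	char buf[16];

	/* strncpy(): no NUL guarantee if src fills the buffer, no error report. */
	strncpy(buf, src, sizeof(buf));

	/* strscpy(): always NUL-terminates; a negative return means truncation. */
	if (strscpy(buf, src, sizeof(buf)) < 0)
		pr_warn("source string was truncated\n");
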
CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Artem Chernyshev Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 +++------ fs/btrfs/rcu-string.h | 6 +++++- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bed74a3ff574..4fd6b61b06a4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2859,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) { - strncpy(di_args->path, btrfs_dev_name(dev), - sizeof(di_args->path) - 1); - di_args->path[sizeof(di_args->path) - 1] = 0; - } else { + if (dev->name) + strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); + else di_args->path[0] = '\0'; - } out: rcu_read_unlock(); diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c1a617eb25d..5c2b66d155ef 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) (len * sizeof(char)), mask); if (!ret) return ret; - strncpy(ret->str, src, len); + /* Warn if the source got unexpectedly truncated. */ + if (WARN_ON(strscpy(ret->str, src, len) < 0)) { + kfree(ret); + return NULL; + } return ret; } -- cgit From 3a8d1db341b93e65fa76051ab833b3640842b6eb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:23 +0000 Subject: btrfs: unify overwrite_item() and do_overwrite_item() After commit 193df6245704 ("btrfs: search for last logged dir index if it's not cached in the inode"), there are no more callers of do_overwrite_item(), except overwrite_item(). Originally both used to be the same function, but were split in commit 086dcbfa50d3 ("btrfs: insert items in batches when logging a directory when possible"), as there was the need to execute all logic of overwrite_item() but skip the tree search, since in the context of directory logging we already had a path with a leaf to copy data from. So unify them again as there is no more need to have them split. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 76 +++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4387cd2ade97..bcc8717f51fd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -365,11 +365,25 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -static int do_overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. 
+ */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -386,22 +400,10 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* Our caller must have done a search for the key for us. */ - ASSERT(path->nodes[0] != NULL); - - /* - * And the slot must point to the exact key or the slot where the key - * should be at (the first item with a key greater than 'key') - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - struct btrfs_key found_key; - - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - ret = btrfs_comp_cpu_keys(&found_key, key); - ASSERT(ret >= 0); - } else { - ret = 1; - } + /* Look for the key in the destination tree. */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; if (ret == 0) { char *src_copy; @@ -579,36 +581,6 @@ no_copy: return 0; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) -{ - int ret; - - /* Look for the key in the destination tree. */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; - - return do_overwrite_item(trans, root, path, eb, slot, key); -} - static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, struct fscrypt_str *name) { @@ -5406,7 +5378,7 @@ struct btrfs_dir_list { * has a size that doesn't match the sum of the lengths of all the logged * names - this is ok, not a problem, because at log replay time we set the * directory's i_size to the correct value (see replay_one_name() and - * do_overwrite_item()). + * overwrite_item()). */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_inode *start_inode, -- cgit From 3eb423442483d454937cdc8853e58c399ffe5514 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:24 +0000 Subject: btrfs: remove outdated logic from overwrite_item() and add assertion As of commit 193df6245704 ("btrfs: search for last logged dir index if it's not cached in the inode"), the overwrite_item() function is always called for a root that is from a fs/subvolume tree. In other words, now it's only used during log replay to modify a fs/subvolume tree. Therefore we can remove the logic that checks if we are dealing with a log tree at overwrite_item(). 
So remove that logic, replacing it with an assertion and document that if we ever need to support a log root there, we will need to clone the leaf from the fs/subvolume tree and then release it before modifying the log tree, which is needed to avoid a potential deadlock, similar to the one recently fixed by a patch with the subject: "btrfs: do not modify log tree while holding a leaf from fs tree locked" Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcc8717f51fd..a3c43f0b1c95 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -391,11 +391,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; - int overwrite_root = 0; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - overwrite_root = 1; + /* + * This is only used during log replay, so the root is always from a + * fs/subvolume tree. In case we ever need to support a log root, then + * we'll have to clone the leaf in the path, release the path and use + * the leaf before writing into the log tree. See the comments at + * copy_items() for more details. + */ + ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); @@ -548,8 +553,7 @@ insert: goto no_copy; } - if (overwrite_root && - S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(path->nodes[0], -- cgit From 1742e1c90c3da344f3bb9b1f1309b3f47482756a Mon Sep 17 00:00:00 2001 From: void0red Date: Wed, 23 Nov 2022 22:39:45 +0800 Subject: btrfs: fix extent map use-after-free when handling missing device in read_one_chunk Store the error code before freeing the extent_map. Though it's reference counted structure, in that function it's the first and last allocation so this would lead to a potential use-after-free. The error can happen eg. when chunk is stored on a missing device and the degraded mount option is missing. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216721 Reported-by: eriri <1527030098@qq.com> Fixes: adfb69af7d8c ("btrfs: add_missing_dev() should return the actual error") CC: stable@vger.kernel.org # 4.9+ Signed-off-by: void0red Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index acab20f2863d..aa25fa335d3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6976,8 +6976,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].dev = handle_missing_device(fs_info, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { + ret = PTR_ERR(map->stripes[i].dev); free_extent_map(em); - return PTR_ERR(map->stripes[i].dev); + return ret; } } -- cgit From 162d053e15fe985f754ef495a96eb3db970c43ed Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Nov 2022 15:07:30 +0000 Subject: btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range If we get -ENOMEM while dropping file extent items in a given range, at btrfs_drop_extents(), due to failure to allocate memory when attempting to increment the reference count for an extent or drop the reference count, we handle it with a BUG_ON(). This is excessive, instead we can simply abort the transaction and return the error to the caller. In fact most callers of btrfs_drop_extents(), directly or indirectly, already abort the transaction if btrfs_drop_extents() returns any error. Also, we already have error paths at btrfs_drop_extents() that may return -ENOMEM and in those cases we abort the transaction, like for example anything that changes the b+tree may return -ENOMEM due to a failure to allocate a new extent buffer when COWing an existing extent buffer, such as a call to btrfs_duplicate_item() for example. So replace the BUG_ON() calls with proper logic to abort the transaction and return the error. Reported-by: syzbot+0b1fb6b0108c27419f9f@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000089773e05ee4b9cb4@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 448b143a5cb2..91b00eb2440e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -380,7 +380,10 @@ next_slot: args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } } key.offset = args->start; } @@ -467,7 +470,10 @@ delete_extent_item: key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } args->bytes_found += extent_end - key.offset; } -- cgit From b7af0635c87ff78d6bd523298ab7471f9ffd3ce5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 30 Nov 2022 15:51:20 +0000 Subject: btrfs: print transaction aborted messages with an error level Currently we print the transaction aborted message with a debug level, but a transaction abort is an exceptional event that indicates something went wrong and it's useful to have it printed with an error level as it helps analysing problems in a production environment, where debug level messages are typically not logged. 
For example reports from syzbot never include the transaction aborted message, since the log level on the test machines is above the debug level. So change the log level from debug to error. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 5ac410e313e4..190af1f698d9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -197,13 +197,13 @@ do { \ &((trans)->fs_info->fs_state))) { \ first = true; \ if (WARN(abort_should_print_stack(errno), \ - KERN_DEBUG \ + KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (errno))) { \ /* Stack trace printed. */ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ (errno)); \ } \ } \ -- cgit From d01c6ed6db38a4db2921591e4f5425bee1931aca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Dec 2022 14:27:05 -0500 Subject: NFS4.x/pnfs: Fix up logging of layout stateids If the layout is invalid, then just log a '0' value. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2cff5901c689..e3fbdc8a98eb 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1815,7 +1815,7 @@ TRACE_EVENT(pnfs_update_layout, __entry->count = count; __entry->iomode = iomode; __entry->reason = reason; - if (lo != NULL) { + if (lo != NULL && pnfs_layout_is_valid(lo)) { __entry->layoutstateid_seq = be32_to_cpu(lo->plh_stateid.seqid); __entry->layoutstateid_hash = @@ -1869,7 +1869,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event, __entry->pos = pos; __entry->count = count; __entry->iomode = iomode; - if (lo != NULL) { + if (lo != NULL && pnfs_layout_is_valid(lo)) { __entry->layoutstateid_seq = be32_to_cpu(lo->plh_stateid.seqid); __entry->layoutstateid_hash = -- cgit From e6b842741b4f39007215fd7e545cb55aa3d358a2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 5 Dec 2022 15:31:36 -0800 Subject: pstore: Avoid kcore oops by vmap()ing with VM_IOREMAP An oops can be induced by running 'cat /proc/kcore > /dev/null' on devices using pstore with the ram backend because kmap_atomic() assumes lowmem pages are accessible with __va(). 
Unable to handle kernel paging request at virtual address ffffff807ff2b000 Mem abort info: ESR = 0x96000006 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000081d87000 [ffffff807ff2b000] pgd=180000017fe18003, p4d=180000017fe18003, pud=180000017fe18003, pmd=0000000000000000 Internal error: Oops: 96000006 [#1] PREEMPT SMP Modules linked in: dm_integrity CPU: 7 PID: 21179 Comm: perf Not tainted 5.15.67-10882-ge4eb2eb988cd #1 baa443fb8e8477896a370b31a821eb2009f9bfba Hardware name: Google Lazor (rev3 - 8) (DT) pstate: a0400009 (NzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __memcpy+0x110/0x260 lr : vread+0x194/0x294 sp : ffffffc013ee39d0 x29: ffffffc013ee39f0 x28: 0000000000001000 x27: ffffff807ff2b000 x26: 0000000000001000 x25: ffffffc0085a2000 x24: ffffff802d4b3000 x23: ffffff80f8a60000 x22: ffffff802d4b3000 x21: ffffffc0085a2000 x20: ffffff8080b7bc68 x19: 0000000000001000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: ffffffd3073f2e60 x14: ffffffffad588000 x13: 0000000000000000 x12: 0000000000000001 x11: 00000000000001a2 x10: 00680000fff2bf0b x9 : 03fffffff807ff2b x8 : 0000000000000001 x7 : 0000000000000000 x6 : 0000000000000000 x5 : ffffff802d4b4000 x4 : ffffff807ff2c000 x3 : ffffffc013ee3a78 x2 : 0000000000001000 x1 : ffffff807ff2b000 x0 : ffffff802d4b3000 Call trace: __memcpy+0x110/0x260 read_kcore+0x584/0x778 proc_reg_read+0xb4/0xe4 During early boot, memblock reserves the pages for the ramoops reserved memory node in DT that would otherwise be part of the direct lowmem mapping. Pstore's ram backend reuses those reserved pages to change the memory type (writeback or non-cached) by passing the pages to vmap() (see pfn_to_page() usage in persistent_ram_vmap() for more details) with specific flags. When read_kcore() starts iterating over the vmalloc region, it runs over the virtual address that vmap() returned for ramoops. In aligned_vread() the virtual address is passed to vmalloc_to_page() which returns the page struct for the reserved lowmem area. That lowmem page is passed to kmap_atomic(), which effectively calls page_to_virt() that assumes a lowmem page struct must be directly accessible with __va() and friends. These pages are mapped via vmap() though, and the lowmem mapping was never made, so accessing them via the lowmem virtual address oopses like above. Let's side-step this problem by passing VM_IOREMAP to vmap(). This will tell vread() to not include the ramoops region in the kcore. Instead the area will look like a bunch of zeros. The alternative is to teach kmap() about vmalloc areas that intersect with lowmem. Presumably such a change isn't a one-liner, and there isn't much interest in inspecting the ramoops region in kcore files anyway, so the most expedient route is taken for now. 
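A minimal sketch of the mapping path in question (simplified from persistent_ram_vmap(); error handling, alignment of the start address, and prot selection are omitted, and the function name is illustrative):

    static void *ram_vmap_sketch(phys_addr_t start, size_t size, pgprot_t prot)
    {
            unsigned int i, page_count = DIV_ROUND_UP(size, PAGE_SIZE);
            struct page **pages;
            void *vaddr;

            pages = kmalloc_array(page_count, sizeof(*pages), GFP_KERNEL);
            if (!pages)
                    return NULL;
            for (i = 0; i < page_count; i++)
                    pages[i] = pfn_to_page((start >> PAGE_SHIFT) + i);
            /* VM_IOREMAP makes vread() zero-fill this region in
             * /proc/kcore instead of kmap()ing pages that have no
             * lowmem mapping. */
            vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot);
            kfree(pages);
            return vaddr;
    }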
Cc: Brian Geffon Cc: Mike Rapoport Cc: Andrew Morton Fixes: 404a6043385d ("staging: android: persistent_ram: handle reserving and mapping memory") Signed-off-by: Stephen Boyd Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221205233136.3420802-1-swboyd@chromium.org --- fs/pstore/ram_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 97dde525150a..966191d3a5ba 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -440,7 +440,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, phys_addr_t addr = page_start + i * PAGE_SIZE; pages[i] = pfn_to_page(addr >> PAGE_SHIFT); } - vaddr = vmap(pages, page_count, VM_MAP, prot); + /* + * VM_IOREMAP used here to bypass this region during vread() + * and kmap_atomic() (i.e. kcore) to avoid __va() failures. + */ + vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot); kfree(pages); /* -- cgit From 6f1c1d95dc93b52a8ef9cc1f3f610c2d5e6b217b Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Fri, 23 Sep 2022 13:40:14 +0800 Subject: NFS: make sure open context mode has FMODE_EXEC when a file is opened for exec Because the file's f_mode never has FMODE_EXEC set, the open context mode won't get FMODE_EXEC from it. The open context mode only cares about FMODE_READ/FMODE_WRITE/FMODE_EXEC, and all of that information can be derived from the file's f_flags, so convert f_flags to the open context mode with flags_to_mode(). Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 ++- fs/nfs/nfs4file.c | 12 ++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6b2cfa59a1a2..e98ee7599eeb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1168,7 +1168,8 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(filp->f_flags), filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 9eb181287879..2563ed8580f3 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -32,7 +32,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; - fmode_t f_mode; struct iattr attr; int err; @@ -51,17 +50,14 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (err) return err; - f_mode = filp->f_mode; - if ((openflags & O_ACCMODE) == 3) - f_mode |= flags_to_mode(openflags); - /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(openflags), filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -366,8 +362,8 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out_free_name; } - ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode, - filep); + ctx = alloc_nfs_open_context(filep->f_path.dentry, + flags_to_mode(filep->f_flags), filep); if (IS_ERR(ctx)) { res = ERR_CAST(ctx); goto out_filep; -- cgit From d564d2c4c2445cb0972453933dc87c2dcaac8597 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Fri, 23 Sep 2022 13:40:15 +0800 Subject: NFSv4: check FMODE_EXEC from open context mode in
nfs4_opendata_access() After converting the file's f_flags to the open context mode with flags_to_mode(), the open context mode will have FMODE_EXEC when a file is opened for exec, so we can check FMODE_EXEC from the open context mode. No functional change, just simplify the code. Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e51044a5f550..44f1dcb6020f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2630,8 +2630,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) */ static int nfs4_opendata_access(const struct cred *cred, struct nfs4_opendata *opendata, - struct nfs4_state *state, fmode_t fmode, - int openflags) + struct nfs4_state *state, fmode_t fmode) { struct nfs_access_entry cache; u32 mask, flags; @@ -2642,11 +2641,7 @@ static int nfs4_opendata_access(const struct cred *cred, return 0; mask = 0; - /* - * Use openflags to check for exec, because fmode won't - * always have FMODE_EXEC set when file open for exec. - */ - if (openflags & __FMODE_EXEC) { + if (fmode & FMODE_EXEC) { /* ONLY check for exec rights */ if (S_ISDIR(state->inode->i_mode)) mask = NFS4_ACCESS_LOOKUP; @@ -3034,7 +3029,7 @@ static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, - int flags, struct nfs_open_context *ctx) + struct nfs_open_context *ctx) { struct nfs4_state_owner *sp = opendata->owner; struct nfs_server *server = sp->so_server; @@ -3095,8 +3090,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, /* Parse layoutget results before we check for access */ pnfs_parse_lgopen(state->inode, opendata->lgp, ctx); - ret = nfs4_opendata_access(sp->so_cred, opendata, state, - acc_mode, flags); + ret = nfs4_opendata_access(sp->so_cred, opendata, state, acc_mode); if (ret != 0) goto out; @@ -3170,7 +3164,7 @@ static int _nfs4_do_open(struct inode *dir, if (d_really_is_positive(dentry)) opendata->state = nfs4_get_open_state(d_inode(dentry), sp); - status = _nfs4_open_and_get_state(opendata, flags, ctx); + status = _nfs4_open_and_get_state(opendata, ctx); if (status != 0) goto err_opendata_put; state = ctx->state; -- cgit From 5559405df652008e56eee88872126fe4c451da67 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Mon, 24 Oct 2022 00:39:45 +0800 Subject: nfs: fix possible null-ptr-deref when parsing param According to commit "vfs: parse: deal with zero length string value", the kernel sets param->string to a null pointer in vfs_parse_fs_string() if the fs string has zero length. The problem is that nfs_fs_context_parse_param() dereferences param->string without checking whether it is a null pointer, which may trigger a null-ptr-deref bug. This patch solves it by adding a sanity check on param->string in nfs_fs_context_parse_param().
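The problematic case is simply an option with an empty value, e.g. a mount passing "vers=": vfs_parse_fs_string() stores NULL for the zero-length value before the filesystem parser runs. Condensed, one of the guarded switch arms then looks like this (a sketch of the fix below, error handling after the parse call trimmed):

    case Opt_vers:
            /* "vers=" reaches here with param->string == NULL, since
             * vfs_parse_fs_string() stores NULL for a zero-length
             * value; bail out before nfs_parse_version_string()
             * dereferences it. */
            if (!param->string)
                    goto out_invalid_value;
            trace_nfs_mount_assign(param->key, param->string);
            ret = nfs_parse_version_string(fc, param->string);
            break;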
Signed-off-by: Hawkins Jiawei Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 09833ec102fc..9bcd53d5c7d4 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -684,6 +684,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, return ret; break; case Opt_vers: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); ret = nfs_parse_version_string(fc, param->string); if (ret < 0) @@ -696,6 +698,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_proto: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); protofamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { @@ -732,6 +736,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_mountproto: + if (!param->string) + goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); mountfamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { -- cgit From ef8d98f20dfc79777d2b66a30926533225dc6efa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Oct 2022 10:26:51 +1100 Subject: NFS: avoid spurious warning of lost lock that is being unlocked. When the NFSv4 state manager recovers state after a server restart, it reports that locks have been lost if it finds any lock state for which recovery hasn't been successful. i.e. any for which NFS_LOCK_INITIALIZED is not set. However it only tries to recover locks that are still linked to inode->i_flctx. So if a lock has been removed from inode->i_flctx, but the state for that lock has not yet been destroyed, then a spurious warning results. nfs4_proc_unlck() calls locks_lock_inode_wait() - which removes the lock from ->i_flctx - before sending the unlock request to the server and before the final nfs4_put_lock_state() is called. This allows a window in which a spurious warning can be produced. So add a new flag NFS_LOCK_UNLOCKING which is set once the decision has been made to unlock the lock. This will prevent it from triggering any warning. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 3 ++- fs/nfs/nfs4state.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index cfef738d765e..5edd1704f735 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -149,6 +149,7 @@ struct nfs4_lock_state { struct nfs4_state * ls_state; /* Pointer to open state */ #define NFS_LOCK_INITIALIZED 0 #define NFS_LOCK_LOST 1 +#define NFS_LOCK_UNLOCKING 2 unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 44f1dcb6020f..40d749f29ed3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7023,12 +7023,13 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_unlock(&sp->so_delegreturn_mutex); goto out; } + lsp = request->fl_u.nfs4_fl.owner; + set_bit(NFS_LOCK_UNLOCKING, &lsp->ls_flags); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? 
*/ - lsp = request->fl_u.nfs4_fl.owner; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) goto out; alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a2d2d5d1b088..7c1f43507813 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1619,7 +1619,8 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { trace_nfs4_state_lock_reclaim(state, lock); - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags) && + !test_bit(NFS_LOCK_UNLOCKING, &lock->ls_flags)) *lost_locks += 1; } spin_unlock(&state->state_lock); -- cgit From 36357fe74ef736524a29fbd3952948768510a8b9 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 13:15:25 -0500 Subject: NFSv4.2: Set the correct size scratch buffer for decoding READ_PLUS The scratch_buf array is 16 bytes, but I was passing 32 to the xdr_set_scratch_buffer() function. Fix this by using sizeof(), which is what I probably should have been doing this whole time. Fixes: d3b00a802c84 ("NFS: Replace the READ_PLUS decoding code") Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index fe1aeb0f048f..2fd465cab631 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1142,7 +1142,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (!segs) return -ENOMEM; - xdr_set_scratch_buffer(xdr, &scratch_buf, 32); + xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf)); status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); -- cgit From f8527028a7e52da884055c401abc04e0b0c84285 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 13:15:26 -0500 Subject: NFSv4.2: Fix up READ_PLUS alignment Assume that the first segment will be a DATA segment, and place the data directly into the xdr pages so it doesn't need to be shifted. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 2fd465cab631..d80ee88ca996 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -47,13 +47,14 @@ #define decode_deallocate_maxsz (op_decode_hdr_maxsz) #define encode_read_plus_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \ +#define NFS42_READ_PLUS_DATA_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ 2 /* data_info4.di_offset */ + \ - 2 /* data_info4.di_length */) + 1 /* data_info4.di_length */) #define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ 1 /* rpr_eof */ + \ 1 /* rpr_contents count */ + \ - 2 * NFS42_READ_PLUS_SEGMENT_SIZE) + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ -- cgit From a60214c2465493aac0b014d87ee19327b6204c42 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 15:30:47 -0500 Subject: NFS: Allow very small rsize & wsize again 940261a19508 introduced nfs_io_size() to clamp the iosize to a multiple of PAGE_SIZE. 
This had the unintended side effect of no longer allowing iosizes smaller than a page, which can be useful in some situations. UDP already has an exception that causes it to fall back on the power-of-two style sizes instead. This patch adds an additional exception for very small iosizes. Reported-by: Jeff Layton Fixes: 940261a19508 ("NFS: Allow setting rsize / wsize to a multiple of PAGE_SIZE") Signed-off-by: Anna Schumaker Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 647fc3f547cb..ae7d4a8c728c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -739,12 +739,10 @@ unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) iosize = NFS_DEF_FILE_IO_SIZE; else if (iosize >= NFS_MAX_FILE_IO_SIZE) iosize = NFS_MAX_FILE_IO_SIZE; - else - iosize = iosize & PAGE_MASK; - if (proto == XPRT_TRANSPORT_UDP) + if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE) return nfs_block_bits(iosize, NULL); - return iosize; + return iosize & PAGE_MASK; } /* -- cgit From 700fa9b1b361760ca6c28e2e7d6cccbdc9fb6716 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 5 Dec 2022 17:13:45 +0800 Subject: NFS: use sysfs_emit() instead of scnprintf() Follow the advice of Documentation/filesystems/sysfs.rst: show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Signed-off-by: Trond Myklebust --- fs/nfs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 88a23af2bd5c..b0ef7e7ddb30 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -354,7 +354,7 @@ static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp) num = (num + (HZ - 1)) / HZ; } else num = -1; - return scnprintf(buffer, PAGE_SIZE, "%li\n", num); + return sysfs_emit(buffer, "%li\n", num); } static const struct kernel_param_ops param_ops_nfs_timeout = { -- cgit From 19cdc8fa5b9b768f5753eefe3da32fcbafcb0d18 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 5 Dec 2022 17:19:16 +0800 Subject: fs: nfs: sysfs: use sysfs_emit() instead of scnprintf() Follow the advice of Documentation/filesystems/sysfs.rst: show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Signed-off-by: Trond Myklebust --- fs/nfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index a6f740366963..26f8ece2a997 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -82,7 +82,7 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj, ssize_t ret; rcu_read_lock(); - ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier)); + ret = sysfs_emit(buf, "%s\n", rcu_dereference(c->identifier)); rcu_read_unlock(); return ret; } -- cgit From b4e4f66901658fae0614dea5bf91062a5387eda7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Dec 2022 12:42:59 -0500 Subject: NFSv4.x: Fail client initialisation if state manager thread can't run If the state manager thread fails to start, then we should just mark the client initialisation as failed so that other processes or threads don't get stuck in nfs_wait_client_init_complete().
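The general shape of the fix, as a self-contained sketch (generic names: 'struct init_state' and 'start_manager' are illustrative; the real code records the error with nfs_mark_client_ready() and uses the NFS client's own wait machinery rather than a completion):

    #include <linux/completion.h>
    #include <linux/err.h>
    #include <linux/kthread.h>

    struct init_state {
            struct completion done;        /* init_completion() done by the caller */
            int error;
    };

    static int start_manager(struct init_state *st, int (*fn)(void *), void *data)
    {
            struct task_struct *task = kthread_run(fn, data, "manager");

            if (IS_ERR(task)) {
                    /* Record the error and complete initialisation anyway,
                     * so waiters wake up instead of sleeping forever for a
                     * thread that never ran. */
                    st->error = PTR_ERR(task);
                    complete_all(&st->done);
                    return st->error;
            }
            return 0;
    }

Waiters do wait_for_completion(&st->done) and then check st->error, mirroring what nfs_wait_client_init_complete() does with the client's init state.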
Reported-by: ChenXiaoSong Fixes: 4697bd5e9419 ("NFSv4: Fix a race in the net namespace mount notification") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7c1f43507813..5720196141e1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1230,6 +1230,8 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (IS_ERR(task)) { printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); + if (!nfs_client_init_is_complete(clp)) + nfs_mark_client_ready(clp, PTR_ERR(task)); nfs4_clear_state_manager_bit(clp); clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); -- cgit From ce529cc25b184e93397b94a8a322128fc0095cbb Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Wed, 30 Nov 2022 14:04:55 +0800 Subject: erofs: enable large folios for iomap mode Enable large folios for iomap mode. Then the readahead routine will pass down large folios containing multiple pages. Let's enable this for the non-compressed format for now, until the compression part supports large folios later. When large folios are supported, the iomap routine will allocate an iomap_page for each large folio, and thus we need iomap_release_folio() and iomap_invalidate_folio() to free the iomap_page when these folios get reclaimed or invalidated. Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221130060455.44532-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 2 ++ fs/erofs/inode.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fe8ac0e163f7..c9526c627dda 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -403,6 +403,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index ad2a82f2eb4c..e457b8a59ee7 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,6 +295,8 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + if (!erofs_is_fscache_mode(inode->i_sb)) + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; -- cgit From 27f2a2dcc6261406b509b5022a1e5c23bf622830 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 25 Nov 2022 19:08:22 +0800 Subject: erofs: check the uniqueness of fsid in shared domain in advance When shared domain is enabled, mounting twice with the same fsid and domain_id will trigger a sysfs warning as shown below: sysfs: cannot create duplicate filename '/fs/erofs/d0,meta.bin' CPU: 15 PID: 1051 Comm: mount Not tainted 6.1.0-rc6+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl+0x38/0x49 dump_stack+0x10/0x12 sysfs_warn_dup.cold+0x17/0x27 sysfs_create_dir_ns+0xb8/0xd0 kobject_add_internal+0xb1/0x240 kobject_init_and_add+0x71/0xa0 erofs_register_sysfs+0x89/0x110 erofs_fc_fill_super+0x98c/0xaf0 vfs_get_super+0x7d/0x100 get_tree_nodev+0x16/0x20 erofs_fc_get_tree+0x20/0x30 vfs_get_tree+0x24/0xb0 path_mount+0x2fa/0xa90 do_mount+0x7c/0xa0 __x64_sys_mount+0x8b/0xe0 do_syscall_64+0x30/0x60 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The reason is
erofs_fscache_register_cookie() doesn't guarantee the primary data blob (aka fsid) is unique in the shared domain, and erofs_register_sysfs(), invoked by the second mount, will fail due to the duplicated fsid in the shared domain and report a warning. It would be better to check the uniqueness of fsid before doing erofs_register_sysfs(), so add a new flags parameter to erofs_fscache_register_cookie() and do the uniqueness check when EROFS_REG_COOKIE_NEED_NOEXIST is set. After the patch, the error in dmesg for the duplicated mount would be: erofs: ...: erofs_domain_register_cookie: XX already exists in domain YY Reviewed-by: Jia Zhu Reviewed-by: Jingbo Xu Reviewed-by: Chao Yu Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20221125110822.3812942-1-houtao@huaweicloud.com Fixes: 7d41963759fe ("erofs: Support sharing cookies in the same domain") Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 47 +++++++++++++++++++++++++++++------------------ fs/erofs/internal.h | 10 ++++++++-- fs/erofs/super.c | 2 +- 3 files changed, 44 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index af5ed6b9c54d..6a792a513d6b 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -494,7 +494,8 @@ static int erofs_fscache_register_domain(struct super_block *sb) static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -516,7 +517,7 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, fscache_use_cookie(cookie, false); ctx->cookie = cookie; - if (need_inode) { + if (flags & EROFS_REG_COOKIE_NEED_INODE) { struct inode *const inode = new_inode(sb); if (!inode) { @@ -554,14 +555,15 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) static struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { int err; struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; - ctx = erofs_fscache_acquire_cookie(sb, name, need_inode); + ctx = erofs_fscache_acquire_cookie(sb, name, flags); if (IS_ERR(ctx)) return ctx; @@ -589,7 +591,8 @@ out: static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct inode *inode; struct erofs_fscache *ctx; @@ -602,23 +605,30 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, ctx = inode->i_private; if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) continue; - igrab(inode); + if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { + igrab(inode); + } else { + erofs_err(sb, "%s already exists in domain %s", name, + domain->domain_id); + ctx = ERR_PTR(-EEXIST); + } spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); + ctx = erofs_fscache_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { if (EROFS_SB(sb)->domain_id) - return erofs_domain_register_cookie(sb, name, need_inode); - return erofs_fscache_acquire_cookie(sb, name, need_inode);
+ return erofs_domain_register_cookie(sb, name, flags); + return erofs_fscache_acquire_cookie(sb, name, flags); } void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) @@ -647,6 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; + unsigned int flags; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -655,8 +666,20 @@ int erofs_fscache_register_fs(struct super_block *sb) if (ret) return ret; - /* acquired domain/volume will be relinquished in kill_sb() on error */ - fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true); + /* + * When shared domain is enabled, using NEED_NOEXIST to guarantee + * the primary data blob (aka fsid) is unique in the shared domain. + * + * For non-shared-domain case, fscache_acquire_volume() invoked by + * erofs_fscache_register_volume() has already guaranteed + * the uniqueness of primary data blob. + * + * Acquired domain/volume will be relinquished in kill_sb() on error. + */ + flags = EROFS_REG_COOKIE_NEED_INODE; + if (sbi->domain_id) + flags |= EROFS_REG_COOKIE_NEED_NOEXIST; + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); if (IS_ERR(fscache)) return PTR_ERR(fscache); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 05dc68627722..e51f27b6bde1 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -604,13 +604,18 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_NEED_INODE 1 +#define EROFS_REG_COOKIE_NEED_NOEXIST 2 + /* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode); + char *name, + unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; @@ -623,7 +628,8 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1c7dcca702b3..481788c24a68 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -245,7 +245,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - fscache = erofs_fscache_register_cookie(sb, dif->path, false); + fscache = erofs_fscache_register_cookie(sb, dif->path, 0); if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; -- cgit From 1282dea37b09087b8aec59f0774572c16b52276a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 6 Dec 2022 14:03:52 +0800 Subject: erofs: clean up cached I/O strategies After commit 4c7e42552b3a ("erofs: remove useless cache strategy of DELAYEDALLOC"), only one cached I/O allocation strategy is supported: When cached I/O is preferred, page allocation is applied without direct reclaim. If allocation fails, fall back to inplace I/O. Let's get rid of z_erofs_cache_alloctype. No logical changes. 
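Condensed, the new policy predicate from the patch reads as below (same logic as the hunk above, with comments added):

    static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
    {
            unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;

            /* caching disabled: never allocate cache pages */
            if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
                    return false;

            /* the backmost pcluster is always worth caching */
            if (fe->backmost)
                    return true;

            /* READAROUND additionally caches pclusters before the read head */
            return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
                   fe->map.m_la < fe->headoffset;
    }

If the predicate is true but erofs_allocpage() fails (no direct reclaim is attempted), the code simply falls back to inplace I/O, which is the one remaining strategy described above.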
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221206060352.152830-1-xiang@kernel.org --- fs/erofs/zdata.c | 77 +++++++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b792d424d774..b66c16473273 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -175,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* how to allocate cached pages for a pcluster */ -enum z_erofs_cache_alloctype { - DONTALLOC, /* don't allocate any cached pages */ - /* - * try to use cached I/O if page allocation succeeds or fallback - * to in-place I/O instead to avoid any direct reclaim. - */ - TRYALLOC, -}; - /* * tagged pointer with 1-bit tag for all compressed pages * tag 0 - the page is just found with an extra page reference @@ -292,12 +282,29 @@ struct z_erofs_decompress_frontend { .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } +static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +{ + unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; + + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + fe->map.m_la < fe->headoffset) + return true; + + return false; +} + static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - enum z_erofs_cache_alloctype type, struct page **pagepool) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* * optimistic allocation without direct reclaim since inplace I/O @@ -326,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } else { /* I/O is needed, no possible to decompress directly */ standalone = false; - switch (type) { - case TRYALLOC: - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - continue; - set_page_private(newpage, - Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - break; - default: /* DONTALLOC */ + if (!shouldalloc) continue; - } + + /* + * try to use cached I/O if page allocation + * succeeds or fallback to in-place I/O instead + * to avoid any direct reclaim. 
+ */ + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); } if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, @@ -637,20 +645,6 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, - unsigned int cachestrategy, - erofs_off_t la) -{ - if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) - return false; - - if (fe->backmost) - return true; - - return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && - la < fe->headoffset; -} - static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, struct page *page, unsigned int pageofs, unsigned int len) @@ -687,12 +681,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - - enum z_erofs_cache_alloctype cache_strategy; unsigned int cur, end, spiltted; int err = 0; @@ -746,13 +737,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, - map->m_la)) - cache_strategy = TRYALLOC; - else - cache_strategy = DONTALLOC; - - z_erofs_bind_cache(fe, cache_strategy, pagepool); + z_erofs_bind_cache(fe, pagepool); } hitted: /* -- cgit From 8669247524c73e16e4d3384c4ff882e5c5d06194 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 24 Nov 2022 11:42:11 +0800 Subject: fscache,cachefiles: add prepare_ondemand_read() callback Add prepare_ondemand_read() callback dedicated for the on-demand read scenario, so that callers from this scenario can be decoupled from netfs_io_subrequest. The original cachefiles_prepare_read() is now refactored to a generic routine accepting a parameter list instead of netfs_io_subrequest. There's no logic change, except that the debug id of subrequest and request is removed from trace_cachefiles_prep_read(). Reviewed-by: Jeff Layton Signed-off-by: Jingbo Xu Acked-by: David Howells Link: https://lore.kernel.org/r/20221124034212.81892-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/cachefiles/io.c | 77 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 000a28f46e59..175a25fcade8 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres, term_func, term_func_priv); } -/* - * Prepare a read operation, shortening it to a cached/uncached - * boundary as appropriate. 
- */ -static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) +static inline enum netfs_io_source +cachefiles_do_prepare_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t netfs_ino) { enum cachefiles_prepare_read_trace why; - struct netfs_io_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct cachefiles_object *object; + struct cachefiles_object *object = NULL; struct cachefiles_cache *cache; struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; struct file *file = cachefiles_cres_file(cres); enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; + size_t len = *_len; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; int rc; - _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); + _enter("%zx @%llx/%llx", len, start, i_size); - if (subreq->start >= i_size) { + if (start >= i_size) { ret = NETFS_FILL_WITH_ZEROES; why = cachefiles_trace_read_after_eof; goto out_no_object; } if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); why = cachefiles_trace_read_no_data; - if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) + if (!test_bit(NETFS_SREQ_ONDEMAND, _flags)) goto out_no_object; } @@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * retry: off = cachefiles_inject_read_error(); if (off == 0) - off = vfs_llseek(file, subreq->start, SEEK_DATA); + off = vfs_llseek(file, start, SEEK_DATA); if (off < 0 && off >= (loff_t)-MAX_ERRNO) { if (off == (loff_t)-ENXIO) { why = cachefiles_trace_read_seek_nxio; @@ -449,21 +446,22 @@ retry: goto out; } - if (off >= subreq->start + subreq->len) { + if (off >= start + len) { why = cachefiles_trace_read_found_hole; goto download_and_store; } - if (off > subreq->start) { + if (off > start) { off = round_up(off, cache->bsize); - subreq->len = off - subreq->start; + len = off - start; + *_len = len; why = cachefiles_trace_read_found_part; goto download_and_store; } to = cachefiles_inject_read_error(); if (to == 0) - to = vfs_llseek(file, subreq->start, SEEK_HOLE); + to = vfs_llseek(file, start, SEEK_HOLE); if (to < 0 && to >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), to, cachefiles_trace_seek_error); @@ -471,12 +469,13 @@ retry: goto out; } - if (to < subreq->start + subreq->len) { - if (subreq->start + subreq->len >= i_size) + if (to < start + len) { + if (start + len >= i_size) to = round_up(to, cache->bsize); else to = round_down(to, cache->bsize); - subreq->len = to - subreq->start; + len = to - start; + *_len = len; } why = cachefiles_trace_read_have_data; @@ -484,12 +483,11 @@ retry: goto out; download_and_store: - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) { - rc = cachefiles_ondemand_read(object, subreq->start, - subreq->len); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); + if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) { + rc = cachefiles_ondemand_read(object, start, len); if (!rc) { - __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags); + __clear_bit(NETFS_SREQ_ONDEMAND, _flags); goto retry; } ret = NETFS_INVALID_READ; @@ -497,10 +495,34 @@ download_and_store: out: cachefiles_end_secure(cache, saved_cred); out_no_object: - trace_cachefiles_prep_read(subreq, ret, why, ino); + 
trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino); return ret; } +/* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + return cachefiles_do_prepare_read(&subreq->rreq->cache_resources, + subreq->start, &subreq->len, i_size, + &subreq->flags, subreq->rreq->inode->i_ino); +} + +/* + * Prepare an on-demand read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source +cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t ino) +{ + return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino); +} + /* * Prepare for a write to occur. */ @@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .prepare_ondemand_read = cachefiles_prepare_ondemand_read, .query_occupancy = cachefiles_query_occupancy, }; -- cgit From 709fe09e281776b5e024fb5934c0485a866b7468 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 24 Nov 2022 11:42:12 +0800 Subject: erofs: switch to prepare_ondemand_read() in fscache mode Switch to prepare_ondemand_read() interface and a self-contained request completion to get rid of netfs_io_[request|subrequest]. The whole request will still be split into slices (subrequest) according to the cache state of the backing file. As long as one of the subrequests fails, the whole request will be marked as failed. Reviewed-by: Gao Xiang Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221124034212.81892-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 261 +++++++++++++++++++---------------------------------- 1 file changed, 94 insertions(+), 167 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 6a792a513d6b..3e794891cd91 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -11,257 +11,180 @@ static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; -static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, +struct erofs_fscache_request { + struct netfs_cache_resources cache_resources; + struct address_space *mapping; /* The mapping being accessed */ + loff_t start; /* Start position */ + size_t len; /* Length of the request */ + size_t submitted; /* Length of submitted */ + short error; /* 0 or error that occurred */ + refcount_t ref; +}; + +static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, loff_t start, size_t len) { - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); - if (!rreq) + req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); + if (!req) return ERR_PTR(-ENOMEM); - rreq->start = start; - rreq->len = len; - rreq->mapping = mapping; - rreq->inode = mapping->host; - INIT_LIST_HEAD(&rreq->subrequests); - refcount_set(&rreq->ref, 1); - return rreq; -} - -static void erofs_fscache_put_request(struct netfs_io_request *rreq) -{ - if (!refcount_dec_and_test(&rreq->ref)) - return; - if (rreq->cache_resources.ops) - 
rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); -} - -static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq) -{ - if (!refcount_dec_and_test(&subreq->ref)) - return; - erofs_fscache_put_request(subreq->rreq); - kfree(subreq); -} + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); -static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - list_del(&subreq->rreq_link); - erofs_fscache_put_subrequest(subreq); - } + return req; } -static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq) +static void erofs_fscache_req_complete(struct erofs_fscache_request *req) { - struct netfs_io_subrequest *subreq; struct folio *folio; - unsigned int iopos = 0; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - bool subreq_failed = false; + bool failed = req->error; + pgoff_t start_page = req->start / PAGE_SIZE; + pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1; - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - subreq_failed = (subreq->error < 0); + XA_STATE(xas, &req->mapping->i_pages, start_page); rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos, pgend; - bool pg_failed = false; - if (xas_retry(&xas, folio)) continue; - - pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - pgend = pgpos + folio_size(folio); - - for (;;) { - if (!subreq) { - pg_failed = true; - break; - } - - pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) - break; - - iopos += subreq->len; - if (!list_is_last(&subreq->rreq_link, - &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - if (pgend == iopos) - break; - } - - if (!pg_failed) + if (!failed) folio_mark_uptodate(folio); - folio_unlock(folio); } rcu_read_unlock(); + + if (req->cache_resources.ops) + req->cache_resources.ops->end_operation(&req->cache_resources); + + kfree(req); } -static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq) +static void erofs_fscache_req_put(struct erofs_fscache_request *req) { - erofs_fscache_rreq_unlock_folios(rreq); - erofs_fscache_clear_subrequests(rreq); - erofs_fscache_put_request(rreq); + if (refcount_dec_and_test(&req->ref)) + erofs_fscache_req_complete(req); } -static void erofc_fscache_subreq_complete(void *priv, +static void erofs_fscache_subreq_complete(void *priv, ssize_t transferred_or_error, bool was_async) { - struct netfs_io_subrequest *subreq = priv; - struct netfs_io_request *rreq = subreq->rreq; + struct erofs_fscache_request *req = priv; if (IS_ERR_VALUE(transferred_or_error)) - subreq->error = transferred_or_error; - - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - erofs_fscache_put_subrequest(subreq); + req->error = transferred_or_error; + erofs_fscache_req_put(req); } /* - * Read data from fscache and fill the read data into page cache described by - * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes - * the start physical address in the cache file. 
+ * Read data from fscache (cookie, pstart, len), and fill the read data into + * page cache described by (req->mapping, lstart, len). @pstart describes the + * start physical address in the cache file. */ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct netfs_io_request *rreq, loff_t pstart) + struct erofs_fscache_request *req, loff_t pstart, size_t len) { enum netfs_io_source source; - struct super_block *sb = rreq->mapping->host->i_sb; - struct netfs_io_subrequest *subreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; + struct super_block *sb = req->mapping->host->i_sb; + struct netfs_cache_resources *cres = &req->cache_resources; struct iov_iter iter; - loff_t start = rreq->start; - size_t len = rreq->len; + loff_t lstart = req->start + req->submitted; size_t done = 0; int ret; - atomic_set(&rreq->nr_outstanding, 1); + DBG_BUGON(len > req->len - req->submitted); ret = fscache_begin_read_operation(cres, cookie); if (ret) - goto out; + return ret; while (done < len) { - subreq = kzalloc(sizeof(struct netfs_io_subrequest), - GFP_KERNEL); - if (subreq) { - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->ref, 2); - subreq->rreq = rreq; - refcount_inc(&rreq->ref); - } else { - ret = -ENOMEM; - goto out; - } - - subreq->start = pstart + done; - subreq->len = len - done; - subreq->flags = 1 << NETFS_SREQ_ONDEMAND; + loff_t sstart = pstart + done; + size_t slen = len - done; + unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - source = cres->ops->prepare_read(subreq, LLONG_MAX); - if (WARN_ON(subreq->len == 0)) + source = cres->ops->prepare_ondemand_read(cres, + sstart, &slen, LLONG_MAX, &flags, 0); + if (WARN_ON(slen == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", - source); - ret = -EIO; - subreq->error = ret; - erofs_fscache_put_subrequest(subreq); - goto out; + erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + return -EIO; } - atomic_inc(&rreq->nr_outstanding); + refcount_inc(&req->ref); + iov_iter_xarray(&iter, READ, &req->mapping->i_pages, + lstart + done, slen); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, - start + done, subreq->len); - - ret = fscache_read(cres, subreq->start, &iter, - NETFS_READ_HOLE_FAIL, - erofc_fscache_subreq_complete, subreq); + ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, + erofs_fscache_subreq_complete, req); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { erofs_err(sb, "failed to fscache_read (ret %d)", ret); - goto out; + return ret; } - done += subreq->len; + done += slen; } -out: - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - return ret; + DBG_BUGON(done != len); + req->submitted += len; + return 0; } static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_dev mdev = { .m_deviceid = 0, .m_pa = folio_pos(folio), }; ret = erofs_map_dev(sb, &mdev); - if (ret) - goto out; + if (ret) { + folio_unlock(folio); + return ret; + } - rreq = erofs_fscache_alloc_request(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto out; + if (IS_ERR(req)) { + folio_unlock(folio); + return
PTR_ERR(req); } - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa); -out: - folio_unlock(folio); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa, folio_size(folio)); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return ret; } /* * Read into page cache in the range described by (@pos, @len). * - * On return, the caller is responsible for page unlocking if the output @unlock - * is true, or the callee will take this responsibility through netfs_io_request - * interface. + * On return, if the output @unlock is true, the caller is responsible for page + * unlocking; otherwise the callee will take this responsibility through request + * completion. * * The return value is the number of bytes successfully handled, or negative * error code on failure. The only exception is that, the length of the range - * instead of the error code is returned on failure after netfs_io_request is - * allocated, so that .readahead() could advance rac accordingly. + * instead of the error code is returned on failure after request is allocated, + * so that .readahead() could advance rac accordingly. */ static int erofs_fscache_data_read(struct address_space *mapping, loff_t pos, size_t len, bool *unlock) { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; struct iov_iter iter; @@ -318,13 +241,17 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (ret) return ret; - rreq = erofs_fscache_alloc_request(mapping, pos, count); - if (IS_ERR(rreq)) - return PTR_ERR(rreq); + req = erofs_fscache_req_alloc(mapping, pos, count); + if (IS_ERR(req)) + return PTR_ERR(req); *unlock = false; - erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa + (pos - map.m_la), count); + if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return count; } -- cgit From be62c5198861156d77b60babb89e70e21c73eb7b Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 1 Dec 2022 15:42:55 +0800 Subject: erofs: support large folios for fscache mode When large folios are supported, one folio can be split into several slices, each of which may be mapped to META/UNMAPPED/MAPPED, and the folio can be unlocked as a whole only when all slices have completed. Thus always allocate erofs_fscache_request for each .read_folio() or .readahead(), in which case the allocated request is responsible for unlocking folios when all slices have completed. As described above, each folio or folio range can be mapped into several slices, while these slices may be mapped to different cookies, and thus each slice needs its own netfs_cache_resources. Here we introduce chained requests to support this, where each .read_folio() or .readahead() call can correspond to multiple requests. Each request has its own netfs_cache_resources and thus is used to access one cookie. Among these requests, there's a primary request, with the others pointing to the primary request.
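The chaining reduces to a parent/child refcount pattern; stripped down, the put path looks like this (field names follow the patch; releasing the per-request cache resources is omitted here):

    static void erofs_fscache_req_put(struct erofs_fscache_request *req)
    {
            if (!refcount_dec_and_test(&req->ref))
                    return;
            if (!req->primary)
                    erofs_fscache_req_complete(req);     /* unlock folios once */
            else
                    erofs_fscache_req_put(req->primary); /* drop pin on primary */
            kfree(req);
    }

Every secondary request holds a reference on the primary, so the folios covered by the whole read are unlocked exactly once, when the last slice drops the final reference.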
Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221201074256.16639-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 148 +++++++++++++++++++++++++++++------------------------ 1 file changed, 80 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 3e794891cd91..f14886c479bd 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -12,6 +12,7 @@ static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; struct erofs_fscache_request { + struct erofs_fscache_request *primary; struct netfs_cache_resources cache_resources; struct address_space *mapping; /* The mapping being accessed */ loff_t start; /* Start position */ @@ -38,6 +39,26 @@ static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_spac return req; } +static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, + size_t len) +{ + struct erofs_fscache_request *req; + + /* use primary request for the first submission */ + if (!primary->submitted) { + refcount_inc(&primary->ref); + return primary; + } + + req = erofs_fscache_req_alloc(primary->mapping, + primary->start + primary->submitted, len); + if (!IS_ERR(req)) { + req->primary = primary; + refcount_inc(&primary->ref); + } + return req; +} + static void erofs_fscache_req_complete(struct erofs_fscache_request *req) { struct folio *folio; @@ -56,17 +77,19 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req) folio_unlock(folio); } rcu_read_unlock(); - - if (req->cache_resources.ops) - req->cache_resources.ops->end_operation(&req->cache_resources); - - kfree(req); } static void erofs_fscache_req_put(struct erofs_fscache_request *req) { - if (refcount_dec_and_test(&req->ref)) - erofs_fscache_req_complete(req); + if (refcount_dec_and_test(&req->ref)) { + if (req->cache_resources.ops) + req->cache_resources.ops->end_operation(&req->cache_resources); + if (!req->primary) + erofs_fscache_req_complete(req); + else + erofs_fscache_req_put(req->primary); + kfree(req); + } } static void erofs_fscache_subreq_complete(void *priv, @@ -74,8 +97,12 @@ static void erofs_fscache_subreq_complete(void *priv, { struct erofs_fscache_request *req = priv; - if (IS_ERR_VALUE(transferred_or_error)) - req->error = transferred_or_error; + if (IS_ERR_VALUE(transferred_or_error)) { + if (req->primary) + req->primary->error = transferred_or_error; + else + req->error = transferred_or_error; + } erofs_fscache_req_put(req); } @@ -131,7 +158,6 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, done += slen; } DBG_BUGON(done != len); - req->submitted += len; return 0; } @@ -167,32 +193,19 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) return ret; } -/* - * Read into page cache in the range described by (@pos, @len). - * - * On return, if the output @unlock is true, the caller is responsible for page - * unlocking; otherwise the callee will take this responsibility through request - * completion. - * - * The return value is the number of bytes successfully handled, or negative - * error code on failure. The only exception is that, the length of the range - * instead of the error code is returned on failure after request is allocated, - * so that .readahead() could advance rac accordingly. 
- */ -static int erofs_fscache_data_read(struct address_space *mapping, - loff_t pos, size_t len, bool *unlock) +static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) { + struct address_space *mapping = primary->mapping; struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; struct iov_iter iter; + loff_t pos = primary->start + primary->submitted; size_t count; int ret; - *unlock = true; - map.m_la = pos; ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) @@ -220,17 +233,19 @@ static int erofs_fscache_data_read(struct address_space *mapping, } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); - return PAGE_SIZE; + primary->submitted += PAGE_SIZE; + return 0; } + count = primary->len - primary->submitted; if (!(map.m_flags & EROFS_MAP_MAPPED)) { - count = len; iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); - return count; + primary->submitted += count; + return 0; } - count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + count = min_t(size_t, map.m_llen - (pos - map.m_la), count); DBG_BUGON(!count || count % PAGE_SIZE); mdev = (struct erofs_map_dev) { @@ -241,68 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (ret) return ret; - req = erofs_fscache_req_alloc(mapping, pos, count); + req = erofs_fscache_req_chain(primary, count); if (IS_ERR(req)) return PTR_ERR(req); - *unlock = false; ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, req, mdev.m_pa + (pos - map.m_la), count); - if (ret) - req->error = ret; - erofs_fscache_req_put(req); - return count; + primary->submitted += count; + return ret; } -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +static int erofs_fscache_data_read(struct erofs_fscache_request *req) { - bool unlock; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + do { + ret = erofs_fscache_data_read_slice(req); + if (ret) + req->error = ret; + } while (!ret && req->submitted < req->len); - ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio), - folio_size(folio), &unlock); - if (unlock) { - if (ret > 0) - folio_mark_uptodate(folio); + return ret; +} + +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fscache_request *req; + int ret; + + req = erofs_fscache_req_alloc(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(req)) { folio_unlock(folio); + return PTR_ERR(req); } - return ret < 0 ? ret : 0; + + ret = erofs_fscache_data_read(req); + erofs_fscache_req_put(req); + return ret; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct folio *folio; - size_t len, done = 0; - loff_t start, pos; - bool unlock; - int ret, size; + struct erofs_fscache_request *req; if (!readahead_count(rac)) return; - start = readahead_pos(rac); - len = readahead_length(rac); + req = erofs_fscache_req_alloc(rac->mapping, + readahead_pos(rac), readahead_length(rac)); + if (IS_ERR(req)) + return; - do { - pos = start + done; - ret = erofs_fscache_data_read(rac->mapping, pos, - len - done, &unlock); - if (ret <= 0) - return; + /* The request completion will drop refs on the folios. 
*/ + while (readahead_folio(rac)) ; - size = ret; - while (size) { - folio = readahead_folio(rac); - size -= folio_size(folio); - if (unlock) { - folio_mark_uptodate(folio); - folio_unlock(folio); - } - } - } while ((done += ret) < len); + erofs_fscache_data_read(req); + erofs_fscache_req_put(req); } static const struct address_space_operations erofs_fscache_meta_aops = { -- cgit From e6687b89225ee9c817e6dcbadc873f6a4691e5c2 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 1 Dec 2022 15:42:56 +0800 Subject: erofs: enable large folios for fscache mode Enable large folios for fscache mode. Enable this feature for the non-compressed format for now, until the compression part supports large folios later. One thing worth noting is that the feature is not enabled for the metadata routine, since meta inodes don't need large folios for now, nor do they support readahead yet. Also document this new feature. Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20221201074256.16639-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e457b8a59ee7..85932086d23f 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,8 +295,7 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; - if (!erofs_is_fscache_mode(inode->i_sb)) - mapping_set_large_folios(inode->i_mapping); + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; -- cgit From 927e5010ff5bd7446a22c511ab8643b9385ddf4d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 18 Oct 2022 18:53:13 +0800 Subject: erofs: use kmap_local_page() only for erofs_bread() Convert all mapped erofs_bread() users to use kmap_local_page() instead of kmap() or kmap_atomic().
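One behavioral rule this conversion has to respect is that kmap_local_page() mappings nest in strict stack (LIFO) order. The toy userspace model below captures just that ordering rule; it deliberately ignores the CPU-local and migration-disable aspects of the real API, so it is only an illustration:

#include <assert.h>
#include <stdio.h>

/* Toy model of the LIFO rule: local mappings form a per-thread stack,
 * so each unmap must release the most recently created mapping. */
static void *map_stack[16];
static int map_top;

static void *toy_kmap_local(void *page)
{
	map_stack[map_top++] = page;
	return page;		/* pretend the page is now addressable */
}

static void toy_kunmap_local(void *addr)
{
	assert(map_top > 0 && map_stack[map_top - 1] == addr);
	map_top--;
}

int main(void)
{
	int a, b;
	void *ka = toy_kmap_local(&a);
	void *kb = toy_kmap_local(&b);	/* nested inner mapping */

	toy_kunmap_local(kb);		/* inner mapping goes first */
	toy_kunmap_local(ka);
	printf("balanced, %d mappings left\n", map_top);
	return 0;
}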
Signed-off-by: Gao Xiang Reviewed-and-tested-by: Jingbo Xu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221018105313.4940-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 8 ++------ fs/erofs/inode.c | 1 + fs/erofs/internal.h | 3 +-- fs/erofs/xattr.c | 8 ++++---- fs/erofs/zmap.c | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c9526c627dda..f57f921683d7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -13,9 +13,7 @@ void erofs_unmap_metabuf(struct erofs_buf *buf) { if (buf->kmap_type == EROFS_KMAP) - kunmap(buf->page); - else if (buf->kmap_type == EROFS_KMAP_ATOMIC) - kunmap_atomic(buf->base); + kunmap_local(buf->base); buf->base = NULL; buf->kmap_type = EROFS_NO_KMAP; } @@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, } if (buf->kmap_type == EROFS_NO_KMAP) { if (type == EROFS_KMAP) - buf->base = kmap(page); - else if (type == EROFS_KMAP_ATOMIC) - buf->base = kmap_atomic(page); + buf->base = kmap_local_page(page); buf->kmap_type = type; } else if (buf->kmap_type != type) { DBG_BUGON(1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 85932086d23f..5b3a793103af 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode) case S_IFDIR: inode->i_op = &erofs_dir_iops; inode->i_fop = &erofs_dir_fops; + inode_nohighmem(inode); break; case S_IFLNK: err = erofs_fill_symlink(inode, kaddr, ofs); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index e51f27b6bde1..bb8501c0ff5b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -255,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap() to map the buffer */ - EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ + EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 8106bcb5a38d..a62fb8a3318a 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) it->blkaddr += erofs_blknr(it->ofs); it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); @@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; @@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; @@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 0bb66927e3d0..749a5ac943f4 100644 --- 
a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -178,7 +178,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int advise, type; m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); @@ -416,7 +416,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, out: pos += lcn * (1 << amortizedshift); m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); -- cgit From c42c0ffe81176940bd5dead474216b7198d77675 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Mon, 5 Dec 2022 11:49:57 +0800 Subject: erofs: Fix pcluster memleak when its block address is zero syzkaller reported a memleak: https://syzkaller.appspot.com/bug?id=62f37ff612f0021641eda5b17f056f1668aa9aed unreferenced object 0xffff88811009c7f8 (size 136): ... backtrace: [] z_erofs_do_read_page+0x99b/0x1740 [] z_erofs_readahead+0x24e/0x580 [] read_pages+0x86/0x3d0 ... syzkaller constructed a case: in z_erofs_register_pcluster(), ztailpacking is false and map->m_pa is zero. This makes pcl->obj.index zero although pcl is not an inline pcluster. Then the following path adds a refcount on grp, but the refcount is never put because pcl is treated as inline. z_erofs_readahead() z_erofs_do_read_page() # for another page z_erofs_collector_begin() erofs_find_workgroup() erofs_workgroup_get() Since it's illegal for the block address of a non-inlined pcluster to be zero, add a check here to avoid registering a pcluster that would be leaked. Fixes: cecf864d3d76 ("erofs: support inline data decompression") Reported-by: syzbot+6f8cd9a0155b366d227f@syzkaller.appspotmail.com Signed-off-by: Chen Zhongjin Reviewed-by: Yue Hu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/Y42Kz6sVkf+XqJRB@debian Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b66c16473273..ccf7c55d477f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -496,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) struct erofs_workgroup *grp; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED)) { + if (!(map->m_flags & EROFS_MAP_ENCODED) || + (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { DBG_BUGON(1); return -EFSCORRUPTED; } -- cgit From d5d188b8f8b38d3d71dd05993874b4fc9284ce95 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 5 Dec 2022 23:00:49 +0800 Subject: erofs: fix missing unmap if z_erofs_get_extent_compressedlen() fails Otherwise, meta buffers could be leaked.
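The fix below routes every failure that happens after the metabuf has been mapped through a single label that unmaps it. Here is a minimal userspace sketch of that single-exit pattern; the names are invented, with malloc()/free() standing in for erofs_read_metabuf()/erofs_unmap_metabuf():

#include <stdio.h>
#include <stdlib.h>

/* Once the buffer is live, every failure must leave through the
 * label that releases it; a plain "return err" would leak it. */
static int toy_map_blocks(int fail)
{
	int err = 0;
	char *buf = malloc(16);		/* stands in for the mapped metabuf */

	if (!buf)
		return -12;		/* -ENOMEM: nothing to clean up yet */

	if (fail) {
		err = -5;		/* -EIO for the sketch */
		goto unmap_out;
	}
	buf[0] = 1;			/* use the buffer on the success path */

unmap_out:
	free(buf);			/* stands in for erofs_unmap_metabuf() */
	return err;
}

int main(void)
{
	printf("success path: %d\n", toy_map_blocks(0));
	printf("failure path: %d\n", toy_map_blocks(1));
	return 0;
}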
Fixes: cec6e93beadf ("erofs: support parsing big pcluster compress indexes") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221205150050.47784-1-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 749a5ac943f4..98eff1259de4 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -694,7 +694,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) - goto out; + goto unmap_out; } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { @@ -718,14 +718,12 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; } + unmap_out: erofs_unmap_metabuf(&m.map->buf); - -out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", __func__, map->m_la, map->m_pa, map->m_llen, map->m_plen, map->m_flags); - return err; } -- cgit From c505feba4c0d76084e56ec498ce819f02a7043ae Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 5 Dec 2022 23:00:50 +0800 Subject: erofs: validate the extent length for uncompressed pclusters syzkaller reported a KASAN use-after-free: https://syzkaller.appspot.com/bug?extid=2ae90e873e97f1faf6f2 The referenced fuzzed image actually has two issues: - m_pa == 0 as a non-inlined pcluster; - The logical length is longer than its physical length. The first issue has already been addressed. This patch addresses the second issue by checking the extent length validity. Reported-by: syzbot+2ae90e873e97f1faf6f2@syzkaller.appspotmail.com Fixes: 02827e1796b3 ("staging: erofs: add erofs_map_blocks_iter") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20221205150050.47784-2-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 98eff1259de4..0150570c33aa 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -698,6 +698,11 @@ static int z_erofs_do_map_blocks(struct inode *inode, } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (map->m_llen > map->m_plen) { + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto unmap_out; + } if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; -- cgit From b5b52de3214a29911f949459a79f6640969b5487 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Wed, 7 Dec 2022 13:49:15 +0000 Subject: fscache: Fix oops due to race with cookie_lru and use_cookie If a cookie expires from the LRU and the LRU_DISCARD flag is set, but the state machine has not run yet, it's possible another thread can call fscache_use_cookie and begin to use it. When the cookie_worker finally runs, it will see the LRU_DISCARD flag set and transition cookie->state to LRU_DISCARDING, which will then withdraw the cookie. Once the cookie is withdrawn and the object is removed, the oops below will occur because the object associated with the cookie is now NULL. Fix the oops by clearing the LRU_DISCARD bit if another thread uses the cookie before the cookie_worker runs. BUG: kernel NULL pointer dereference, address: 0000000000000008 ...
CPU: 31 PID: 44773 Comm: kworker/u130:1 Tainted: G E 6.0.0-5.dneg.x86_64 #1 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 Workqueue: events_unbound netfs_rreq_write_to_cache_work [netfs] RIP: 0010:cachefiles_prepare_write+0x28/0x90 [cachefiles] ... Call Trace: netfs_rreq_write_to_cache_work+0x11c/0x320 [netfs] process_one_work+0x217/0x3e0 worker_thread+0x4a/0x3b0 kthread+0xd6/0x100 Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning") Reported-by: Daire Byrne Signed-off-by: Dave Wysochanski Signed-off-by: David Howells Tested-by: Daire Byrne Link: https://lore.kernel.org/r/20221117115023.1350181-1-dwysocha@redhat.com/ # v1 Link: https://lore.kernel.org/r/20221117142915.1366990-1-dwysocha@redhat.com/ # v2 Signed-off-by: Linus Torvalds --- fs/fscache/cookie.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 451d8a077e12..bce2492186d0 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -605,6 +605,14 @@ again: set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); queue = true; } + /* + * We could race with cookie_lru which may set LRU_DISCARD bit + * but has yet to run the cookie state machine. If this happens + * and another thread tries to use the cookie, clear LRU_DISCARD + * so we don't end up withdrawing the cookie while in use. + */ + if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) + fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear); break; case FSCACHE_COOKIE_STATE_FAILED: -- cgit From b3525072835b523b397d459fadd0785d9c24bbd1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Oct 2022 14:29:39 +0100 Subject: orangefs: remove variable i Variable i is just being incremented and is never used anywhere else. The variable and the increment are redundant, so remove them. Signed-off-by: Colin Ian King Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698d..eaa35a966115 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -530,7 +530,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t total_count = 0; ssize_t ret = -EINVAL; - int i = 0; gossip_debug(GOSSIP_FILE_DEBUG, "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", @@ -556,7 +555,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, while (iov_iter_count(iter)) { size_t each_count = iov_iter_count(iter); size_t amt_complete; - i++; /* how much to transfer in this loop iteration */ if (each_count > orangefs_bufmap_size_query()) -- cgit From 610defdccff7b955fe899a825990c2202153a22e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 17 Oct 2022 22:49:37 +0100 Subject: orangefs: remove redundant assignment to variable buffer_index The variable buffer_index is assigned a value that is never read; it is assigned just before the function returns. The assignment is redundant and can be removed.
Cleans up clang scan build warning: fs/orangefs/file.c:276:3: warning: Value stored to 'buffer_index' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 732661aa2680..167fa43b24f9 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -273,7 +273,6 @@ out: gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): PUT buffer_index %d\n", __func__, handle, buffer_index); - buffer_index = -1; } op_release(new_op); return ret; -- cgit From ea60a4ad0cf88b411cde6888b8c890935686ecd7 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:04 +0800 Subject: orangefs: Fix sysfs not cleanup when dev init failed When the dev init fails, the sysfs entries should be cleaned up; otherwise the module can never be loaded again, since a duplicate sysfs directory cannot be created: sysfs: cannot create duplicate filename '/fs/orangefs' CPU: 1 PID: 6549 Comm: insmod Tainted: G W 6.0.0+ #44 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 Call Trace: dump_stack_lvl+0x34/0x44 sysfs_warn_dup.cold+0x17/0x24 sysfs_create_dir_ns+0x16d/0x180 kobject_add_internal+0x156/0x3a0 kobject_init_and_add+0xcf/0x120 orangefs_sysfs_init+0x7e/0x3a0 [orangefs] orangefs_init+0xfe/0x1000 [orangefs] do_one_initcall+0x87/0x2a0 do_init_module+0xdf/0x320 load_module+0x2f98/0x3330 __do_sys_finit_module+0x113/0x1b0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 kobject_add_internal failed for orangefs with -EEXIST, don't try to register things with the same name in the same directory. Fixes: 2f83ace37181 ("orangefs: put register_chrdev immediately before register_filesystem") Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-mod.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index cd7297815f91..5ab741c60b7e 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -141,7 +141,7 @@ static int __init orangefs_init(void) gossip_err("%s: could not initialize device subsystem %d!\n", __func__, ret); - goto cleanup_device; + goto cleanup_sysfs; } ret = register_filesystem(&orangefs_fs_type); @@ -152,11 +152,11 @@ static int __init orangefs_init(void) goto out; } - orangefs_sysfs_exit(); - -cleanup_device: orangefs_dev_cleanup(); +cleanup_sysfs: + orangefs_sysfs_exit(); + sysfs_init_failed: orangefs_debugfs_cleanup(); -- cgit From d23417a5bf3a3afc55de5442eb46e1e60458b0a1 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:05 +0800 Subject: orangefs: Fix kmemleak in orangefs_prepare_debugfs_help_string() When the orangefs module is inserted and then removed, debug_help_string is leaked: unreferenced object 0xffff8881652ba000 (size 4096): comm "insmod", pid 1701, jiffies 4294893639 (age 13218.530s) hex dump (first 32 bytes): 43 6c 69 65 6e 74 20 44 65 62 75 67 20 4b 65 79 Client Debug Key 77 6f 72 64 73 20 61 72 65 20 75 6e 6b 6e 6f 77 words are unknow backtrace: [<0000000004e6f8e3>] kmalloc_trace+0x27/0xa0 [<0000000006f75d85>] orangefs_prepare_debugfs_help_string+0x5e/0x480 [orangefs] [<0000000091270a2a>] _sub_I_65535_1+0x57/0xf70 [crc_itu_t] [<000000004b1ee1a3>] do_one_initcall+0x87/0x2a0 [<000000001d0614ae>] do_init_module+0xdf/0x320 [<00000000efef068c>] load_module+0x2f98/0x3330 [<000000006533b44d>] __do_sys_finit_module+0x113/0x1b0 [<00000000a0da6f99>]
do_syscall_64+0x35/0x80 [<000000007790b19b>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 debug_help_string should always be freed when the module is removed, and the temporary buffer should always be freed once its contents have been copied into debug_help_string. Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 29eaa4544372..a848b6ef9599 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -222,6 +222,8 @@ out: void orangefs_debugfs_cleanup(void) { debugfs_remove_recursive(debug_dir); + kfree(debug_help_string); + debug_help_string = NULL; } /* open ORANGEFS_KMOD_DEBUG_HELP_FILE */ @@ -671,6 +673,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot) memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE); strlcat(debug_help_string, new, string_size); mutex_unlock(&orangefs_help_file_lock); + kfree(new); } rc = 0; -- cgit From 1f2c0e8a587bcafad85019a2d80f158d8d41a868 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:06 +0800 Subject: orangefs: Fix kmemleak in orangefs_sysfs_init() When the orangefs module is inserted and then removed, kobject memory is leaked as below: unreferenced object 0xffff88810f95af00 (size 64): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): a0 83 af 01 81 88 ff ff 08 af 95 0f 81 88 ff ff ................ 08 af 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005a6e4dfe>] orangefs_sysfs_init+0x42/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ae80 (size 64): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): c8 90 0f 02 81 88 ff ff 88 ae 95 0f 81 88 ff ff ................ 88 ae 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000001a4841fa>] orangefs_sysfs_init+0xc7/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ae00 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.511s) hex dump (first 32 bytes): 60 87 a1 00 81 88 ff ff 08 ae 95 0f 81 88 ff ff `............... 08 ae 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................
backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005915e797>] orangefs_sysfs_init+0x12b/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ad80 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.511s) hex dump (first 32 bytes): 78 90 0f 02 81 88 ff ff 88 ad 95 0f 81 88 ff ff x............... 88 ad 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000007a14eb35>] orangefs_sysfs_init+0x1ac/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ac00 (size 64): comm "insmod", pid 783, jiffies 4294813440 (age 65.531s) hex dump (first 32 bytes): e0 ff 67 02 81 88 ff ff 08 ac 95 0f 81 88 ff ff ..g............. 08 ac 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000001f38adcb>] orangefs_sysfs_init+0x291/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 unreferenced object 0xffff88810f95ab80 (size 64): comm "insmod", pid 783, jiffies 4294813441 (age 65.530s) hex dump (first 32 bytes): 50 bf 2f 02 81 88 ff ff 88 ab 95 0f 81 88 ff ff P./............. 88 ab 95 0f 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000009cc7d95b>] orangefs_sysfs_init+0x2f5/0x3a0 [<00000000722645ca>] 0xffffffffa02780fe [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 A release function should be added to each kobj_type to free the memory.
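The contract being applied here is that a refcounted kobject must be freed by its kobj_type's release callback on the final put, never by the code that created it. Below is a simplified userspace sketch of that contract; kobj, kobj_put() and the release hook are illustrative stand-ins, not the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct kobj;

struct kobj_type {
	void (*release)(struct kobj *kobj);
};

struct kobj {
	int refs;
	const struct kobj_type *ktype;
};

/* The final put frees the object through its type's release hook;
 * a kobj_type without one leaks the allocation, as kmemleak showed. */
static void kobj_put(struct kobj *kobj)
{
	if (--kobj->refs)
		return;
	if (kobj->ktype->release)
		kobj->ktype->release(kobj);
	else
		fprintf(stderr, "no release hook: memory leaked\n");
}

static void demo_release(struct kobj *kobj)
{
	printf("release hook frees the kobject\n");
	free(kobj);
}

static const struct kobj_type demo_ktype = { .release = demo_release };

int main(void)
{
	struct kobj *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return 1;
	obj->refs = 1;
	obj->ktype = &demo_ktype;
	kobj_put(obj);		/* last reference: freed via .release */
	return 0;
}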
Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-sysfs.c | 71 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index de80b62553bb..be4ba03a01a0 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -896,9 +896,18 @@ static struct attribute *orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(orangefs_default); +static struct kobject *orangefs_obj; + +static void orangefs_obj_release(struct kobject *kobj) +{ + kfree(orangefs_obj); + orangefs_obj = NULL; +} + static struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = orangefs_default_groups, + .release = orangefs_obj_release, }; static struct orangefs_attribute acache_hard_limit_attribute = @@ -934,9 +943,18 @@ static struct attribute *acache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(acache_orangefs_default); +static struct kobject *acache_orangefs_obj; + +static void acache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(acache_orangefs_obj); + acache_orangefs_obj = NULL; +} + static struct kobj_type acache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = acache_orangefs_default_groups, + .release = acache_orangefs_obj_release, }; static struct orangefs_attribute capcache_hard_limit_attribute = @@ -972,9 +990,18 @@ static struct attribute *capcache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(capcache_orangefs_default); +static struct kobject *capcache_orangefs_obj; + +static void capcache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(capcache_orangefs_obj); + capcache_orangefs_obj = NULL; +} + static struct kobj_type capcache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = capcache_orangefs_default_groups, + .release = capcache_orangefs_obj_release, }; static struct orangefs_attribute ccache_hard_limit_attribute = @@ -1010,9 +1037,18 @@ static struct attribute *ccache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(ccache_orangefs_default); +static struct kobject *ccache_orangefs_obj; + +static void ccache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ccache_orangefs_obj); + ccache_orangefs_obj = NULL; +} + static struct kobj_type ccache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ccache_orangefs_default_groups, + .release = ccache_orangefs_obj_release, }; static struct orangefs_attribute ncache_hard_limit_attribute = @@ -1048,9 +1084,18 @@ static struct attribute *ncache_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(ncache_orangefs_default); +static struct kobject *ncache_orangefs_obj; + +static void ncache_orangefs_obj_release(struct kobject *kobj) +{ + kfree(ncache_orangefs_obj); + ncache_orangefs_obj = NULL; +} + static struct kobj_type ncache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ncache_orangefs_default_groups, + .release = ncache_orangefs_obj_release, }; static struct orangefs_attribute pc_acache_attribute = @@ -1079,9 +1124,18 @@ static struct attribute *pc_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(pc_orangefs_default); +static struct kobject *pc_orangefs_obj; + +static void pc_orangefs_obj_release(struct kobject *kobj) +{ + kfree(pc_orangefs_obj); + pc_orangefs_obj = NULL; +} + static struct kobj_type pc_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = pc_orangefs_default_groups, + .release = pc_orangefs_obj_release, }; static struct 
orangefs_attribute stats_reads_attribute = @@ -1103,19 +1157,20 @@ static struct attribute *stats_orangefs_default_attrs[] = { }; ATTRIBUTE_GROUPS(stats_orangefs_default); +static struct kobject *stats_orangefs_obj; + +static void stats_orangefs_obj_release(struct kobject *kobj) +{ + kfree(stats_orangefs_obj); + stats_orangefs_obj = NULL; +} + static struct kobj_type stats_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = stats_orangefs_default_groups, + .release = stats_orangefs_obj_release, }; -static struct kobject *orangefs_obj; -static struct kobject *acache_orangefs_obj; -static struct kobject *capcache_orangefs_obj; -static struct kobject *ccache_orangefs_obj; -static struct kobject *ncache_orangefs_obj; -static struct kobject *pc_orangefs_obj; -static struct kobject *stats_orangefs_obj; - int orangefs_sysfs_init(void) { int rc = -EINVAL; -- cgit From 31720a2b109b3080eb77e97b8f6f50a27b4ae599 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 18 Oct 2022 12:40:07 +0800 Subject: orangefs: Fix kmemleak in orangefs_{kernel,client}_debug_init() When the orangefs module is inserted and then removed, memory is leaked as below: unreferenced object 0xffff88816b0cc000 (size 2048): comm "insmod", pid 783, jiffies 4294813439 (age 65.512s) hex dump (first 32 bytes): 6e 6f 6e 65 0a 00 00 00 00 00 00 00 00 00 00 00 none............ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000031ab7788>] kmalloc_trace+0x27/0xa0 [<000000005b405fee>] orangefs_debugfs_init.cold+0xaf/0x17f [<00000000e5a0085b>] 0xffffffffa02780f9 [<000000004232d9f7>] do_one_initcall+0x87/0x2a0 [<0000000054f22384>] do_init_module+0xdf/0x320 [<000000003263bdea>] load_module+0x2f98/0x3330 [<0000000052cd4153>] __do_sys_finit_module+0x113/0x1b0 [<00000000250ae02b>] do_syscall_64+0x35/0x80 [<00000000f11c03c7>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Use a global variable as the buffer rather than a dynamic allocation to solve the problem. Signed-off-by: Zhang Xiaoxu Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-debugfs.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index a848b6ef9599..1b508f543384 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask) */ static void orangefs_kernel_debug_init(void) { - int rc = -ENOMEM; - char *k_buffer = NULL; + static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!k_buffer) - goto out; - if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcpy(k_buffer, kernel_debug_string); strcat(k_buffer, "\n"); @@ -213,9 +208,6 @@ static void orangefs_kernel_debug_init(void) debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer, &kernel_debug_fops); - -out: - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); } @@ -299,18 +291,13 @@ static int help_show(struct seq_file *m, void *v) /* * initialize the client-debug file.
*/ -static int orangefs_client_debug_init(void) +static void orangefs_client_debug_init(void) { - int rc = -ENOMEM; - char *c_buffer = NULL; + static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!c_buffer) - goto out; - if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcpy(c_buffer, client_debug_string); strcat(c_buffer, "\n"); @@ -324,13 +311,6 @@ static int orangefs_client_debug_init(void) debug_dir, c_buffer, &kernel_debug_fops); - - rc = 0; - -out: - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); - return rc; } /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ -- cgit From cf8aa9bf97cadf85745506c6a3e244b22c268d63 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 24 Sep 2022 00:33:15 -0700 Subject: ovl: Use "buf" flexible array for memcpy() destination The "buf" flexible array needs to be the memcpy() destination to avoid a false positive run-time warning from the recent FORTIFY_SOURCE hardening: memcpy: detected field-spanning write (size 93) of single field "&fh->fb" at fs/overlayfs/export.c:799 (size 21) Reported-by: syzbot+9d14351a171d0d1c7955@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000763a6c05e95a5985@google.com/ Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index e065a5b9a442..ac9c3ad04016 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -796,7 +796,7 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) return ERR_PTR(-ENOMEM); /* Copy unaligned inner fh into aligned buffer */ - memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET); return fh; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index eee8f08d32b6..e74a610a117e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -108,7 +108,7 @@ struct ovl_fh { u8 padding[3]; /* make sure fb.fid is 32bit aligned */ union { struct ovl_fb fb; - u8 buf[0]; + DECLARE_FLEX_ARRAY(u8, buf); }; } __packed; -- cgit From 5b0db51215e895a361bc63132caa7cca36a53d6a Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Thu, 1 Sep 2022 16:29:29 +0800 Subject: ovl: Use ovl mounter's fsuid and fsgid in ovl_link() There is a case where link() behaves incorrectly on overlayfs: $ mkdir /lower /fuse /merge $ mount -t fuse /fuse $ mkdir /fuse/upper /fuse/work $ mount -t overlay /merge -o lowerdir=/lower,upperdir=/fuse/upper,\ workdir=work $ touch /merge/file $ chown bin.bin /merge/file // the file's owner becomes "bin" $ ln /merge/file /merge/lnkfile Then we get an error (EACCES) because the fuse daemon sees that the link()'s caller is "bin" and denies the request. In the change history of ovl_link(), there are two key commits: The first is commit bb0d2b8ad296 ("ovl: fix sgid on directory") which overrides the cred's fsuid/fsgid using the new inode. The new inode's owner is initialized by inode_init_owner(), which assigns the current user's fsuid to the inode. So the overridden fsuid becomes the current user's. We know link() actually modifies the directory, so the caller must have MAY_WRITE permission on the directory, and the current caller should already have it.
So it is acceptable to use the caller's fsuid. The second is commit 51f7e52dc943 ("ovl: share inode for hard link") which removed the inode creation in ovl_link(). This commit moved inode_init_owner() into ovl_create_object(), so ovl_link() just gives the old inode to ovl_create_or_link(). Then the overridden fsuid becomes the old inode's fsuid, which is neither the caller's nor the overlay mounter's! So this is incorrect. Fix this bug by using the ovl mounter's fsuid/fsgid for the underlying fs's link(). Link: https://lore.kernel.org/all/20220817102952.xnvesg3a7rbv576x@wittgenstein/T Link: https://lore.kernel.org/lkml/20220825130552.29587-1-zhangtianci.1997@bytedance.com/t Signed-off-by: Zhang Tianci Signed-off-by: Jiachen Zhang Reviewed-by: Christian Brauner (Microsoft) Fixes: 51f7e52dc943 ("ovl: share inode for hard link") Cc: # v4.8 Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6b03457f72bb..c3032cef391e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -592,28 +592,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, goto out_revert_creds; } - err = -ENOMEM; - override_cred = prepare_creds(); - if (override_cred) { + if (!attr->hardlink) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_revert_creds; + /* + * In the creation cases(create, mkdir, mknod, symlink), + * ovl should transfer current's fs{u,g}id to underlying + * fs. Because underlying fs want to initialize its new + * inode owner using current's fs{u,g}id. And in this + * case, the @inode is a new inode that is initialized + * in inode_init_owner() to current's fs{u,g}id. So use + * the inode's i_{u,g}id to override the cred's fs{u,g}id. + * + * But in the other hardlink case, ovl_link() does not + * create a new inode, so just use the ovl mounter's + * fs{u,g}id. + */ override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; - if (!attr->hardlink) { - err = security_dentry_create_files_as(dentry, - attr->mode, &dentry->d_name, old_cred, - override_cred); - if (err) { - put_cred(override_cred); - goto out_revert_creds; - } + err = security_dentry_create_files_as(dentry, + attr->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; } put_cred(override_creds(override_cred)); put_cred(override_cred); - - if (!ovl_dentry_is_whiteout(dentry)) - err = ovl_create_upper(dentry, inode, attr); - else - err = ovl_create_over_whiteout(dentry, inode, attr); } + + if (!ovl_dentry_is_whiteout(dentry)) + err = ovl_create_upper(dentry, inode, attr); + else + err = ovl_create_over_whiteout(dentry, inode, attr); + out_revert_creds: revert_creds(old_cred); return err; -- cgit From 73db6a063c785bc3965f17569102a2a61ec10a4c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:07:47 +0200 Subject: ovl: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type-safe helpers that operate on vfs{g,u}id_t.
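The benefit of the dedicated type is that the compiler, rather than convention, keeps filesystem-wide k{g,u}id_t values and mount-idmapped values apart. A small userspace sketch of the idea follows; the types and helpers are simplified stand-ins for the mnt_idmapping.h ones (the real make_vfsuid() also takes the mount's idmapping, which this sketch omits):

#include <stdio.h>

/* Two structurally identical but distinct types: assigning one to the
 * other is a compile error, unlike two bare unsigned ints. */
typedef struct { unsigned int val; } kuid_t;
typedef struct { unsigned int val; } vfsuid_t;

static vfsuid_t make_vfsuid(kuid_t kuid)
{
	return (vfsuid_t){ .val = kuid.val };	/* identity idmapping here */
}

static kuid_t vfsuid_into_kuid(vfsuid_t vfsuid)
{
	return (kuid_t){ .val = vfsuid.val };
}

int main(void)
{
	kuid_t kuid = { 1000 };
	vfsuid_t vfsuid = make_vfsuid(kuid);

	/* kuid = vfsuid; would not compile: the types are distinct */
	printf("round trip: %u\n", vfsuid_into_kuid(vfsuid).val);
	return 0;
}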
Cc: Seth Forshee (Digital Ocean) Cc: Amir Goldstein Cc: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/overlayfs/util.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 81a57a8d80d9..c0c20d33691b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1104,13 +1104,18 @@ void ovl_copyattr(struct inode *inode) struct path realpath; struct inode *realinode; struct user_namespace *real_mnt_userns; + vfsuid_t vfsuid; + vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); real_mnt_userns = mnt_user_ns(realpath.mnt); - inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); - inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); + vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + + inode->i_uid = vfsuid_into_kuid(vfsuid); + inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; -- cgit From cdf5c9d1af411057e33cfbeba02ce19478dedb1e Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 13:47:36 +0800 Subject: ovl: fix comment typos Fix two typos. Reported-by: k2ci Signed-off-by: Jiangshan Yi Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 2 +- fs/overlayfs/file.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index ac9c3ad04016..be866a3a92aa 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -339,7 +339,7 @@ out_iput: return dentry; } -/* Get the upper or lower dentry in stach whose on layer @idx */ +/* Get the upper or lower dentry in stack whose on layer @idx */ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) { struct ovl_entry *oe = dentry->d_fsdata; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index dd688a842b0b..eadbe5e0dffd 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -34,7 +34,7 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modificaton nor notify on underlying */ +/* No atime modification nor notify on underlying */ #define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) static struct file *ovl_open_realfile(const struct file *file, -- cgit From 8ea2876577b57805489e3044de7fcb0330c52f40 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 4 Oct 2022 13:34:32 +0300 Subject: ovl: do not reconnect upper index records in ovl_indexdir_cleanup() ovl_indexdir_cleanup() is called on mount of overlayfs with the nfs_export feature to clean up stale index records for lower and upper files that have been deleted while overlayfs was offline. This has the side effect (good or bad) of pre-populating the inode cache with all the copied-up upper inodes while verifying the index entries. For copied-up directories, the upper file handles are decoded to connected upper dentries. This has the even bigger side effect of reading the content of all the parent upper directories, which may take significantly more time and IO than just reading the upper inodes. Do not request connected upper dentries for verifying upper directory index entries, because we have no use for the connected dentry.
Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/export.c | 4 ++-- fs/overlayfs/namei.c | 7 ++++--- fs/overlayfs/overlayfs.h | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index be866a3a92aa..a25bb3453dde 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -463,7 +463,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, /* Get connected upper overlay dir from index */ if (index) { - struct dentry *upper = ovl_index_upper(ofs, index); + struct dentry *upper = ovl_index_upper(ofs, index, true); dput(index); if (IS_ERR_OR_NULL(upper)) @@ -739,7 +739,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { - struct dentry *upper = ovl_index_upper(ofs, index); + struct dentry *upper = ovl_index_upper(ofs, index, true); err = PTR_ERR(upper); if (IS_ERR_OR_NULL(upper)) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0fd1d5fdfc72..0ce1f9b3a046 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -487,7 +487,8 @@ fail: } /* Get upper dentry from index */ -struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, + bool connected) { struct ovl_fh *fh; struct dentry *upper; @@ -499,7 +500,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); + upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -572,7 +573,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) * directly from the index dentry, but for dir index we first need to * decode the upper directory. */ - upper = ovl_index_upper(ofs, index); + upper = ovl_index_upper(ofs, index, false); if (IS_ERR_OR_NULL(upper)) { err = PTR_ERR(upper); /* diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e74a610a117e..70c5bd81122b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -525,7 +525,8 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set); -struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, + bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); -- cgit From af4dcb6d78b2b05a0431dfd3f67713bba8dc0900 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 4 Oct 2022 13:34:33 +0300 Subject: ovl: use plain list filler in indexdir and workdir cleanup These two cleanup routines use the helper ovl_dir_read() with the merge dir filler, which populates an rb tree that is never used. The index dir entry names all have a long (42-byte) constant prefix, so it is not surprising that perf top has demonstrated high CPU usage by rb tree population during cleanup of a large index dir: - 9.53% ovl_fill_merge - 78.41% ovl_cache_entry_find_link.constprop.27 + 72.11% strncmp Use the plain list filler that does not populate the unneeded rb tree.
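The plain filler only needs to build something that can be walked once, so a singly linked list with O(1) insertion is enough; the merge filler's rb-tree insertion pays a string comparison per tree level for an index that cleanup never queries. Below is a simplified stand-in sketch in userspace C (entry and fill_plain are invented names loosely modeled on ovl_cache_entry and ovl_fill_plain):

#include <stdio.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	char name[64];
};

/* O(1) list prepend: no strncmp-heavy rb-tree insertion. */
static void fill_plain(struct entry **head, const char *name)
{
	struct entry *p = calloc(1, sizeof(*p));

	if (!p)
		abort();
	snprintf(p->name, sizeof(p->name), "%s", name);
	p->next = *head;
	*head = p;
}

int main(void)
{
	struct entry *head = NULL, *p, *next;

	fill_plain(&head, "#00000000000000000000000000000001");
	fill_plain(&head, "#00000000000000000000000000000002");

	for (p = head; p; p = next) {	/* single cleanup pass */
		next = p->next;
		printf("cleanup %s\n", p->name);
		free(p);
	}
	return 0;
}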
Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 2b210640036c..31227e2d85e4 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -1071,14 +1071,10 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; bool incompat = false; @@ -1159,14 +1155,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; err = ovl_dir_read(&path, &rdd); -- cgit From cf4ef7801a8c880902a07712ba7ff61d8e954f85 Mon Sep 17 00:00:00 2001 From: Stanislav Goriainov Date: Wed, 31 Aug 2022 11:06:58 +0300 Subject: ovl: Add comment on upperredirect reassignment If memory for upperredirect was allocated with kstrdup() in the upperdir != NULL and d.redirect != NULL path, it may seem that it can be lost when upperredirect is reassigned later, but that is not possible. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 0a2d0d3f2f291 ("ovl: Check redirect on index as well") Signed-off-by: Stanislav Goriainov Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0ce1f9b3a046..46753134533a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1086,6 +1086,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .mnt = ovl_upper_mnt(ofs), }; + /* + * It's safe to assign upperredirect here: the previous + * assignment happens only if upperdentry is non-NULL, and + * this one only if upperdentry is NULL. + */ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); -- cgit From 1fa9c5c5eddef0505b353031b7605c97e4257e62 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 7 Oct 2022 17:35:26 +0200 Subject: ovl: use inode instead of dentry where possible Passing dentry to some helpers is unnecessary. Simplify these cases.
Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 6 +++--- fs/overlayfs/readdir.c | 46 ++++++++++++++++++++++++---------------------- fs/overlayfs/util.c | 6 ++---- 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 70c5bd81122b..0b07a9ef7e5b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -401,7 +401,7 @@ const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); -u64 ovl_dentry_version_get(struct dentry *dentry); +u64 ovl_inode_version_get(struct inode *inode); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); @@ -571,9 +571,9 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs); * lower dir was removed under it and possibly before it was rotated from upper * to lower layer. */ -static inline bool ovl_dir_is_real(struct dentry *dir) +static inline bool ovl_dir_is_real(struct inode *dir) { - return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); + return !ovl_test_flag(OVL_WHITEOUTS, dir); } /* inode.c */ diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 31227e2d85e4..8cd2b9947de1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -235,15 +235,15 @@ void ovl_dir_cache_free(struct inode *inode) } } -static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { - if (ovl_dir_cache(d_inode(dentry)) == cache) - ovl_set_dir_cache(d_inode(dentry), NULL); + if (ovl_dir_cache(inode) == cache) + ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); @@ -323,15 +323,15 @@ static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; - struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file_inode(file); bool is_real; - if (cache && ovl_dentry_version_get(dentry) != cache->version) { - ovl_cache_put(od, dentry); + if (cache && ovl_inode_version_get(inode) != cache->version) { + ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } - is_real = ovl_dir_is_real(dentry); + is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) @@ -394,9 +394,10 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; + struct inode *inode = d_inode(dentry); - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; @@ -418,8 +419,8 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) return ERR_PTR(res); } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -596,16 +597,17 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const 
struct path *path) { int res; struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ - ovl_dir_cache_free(d_inode(dentry)); - ovl_set_dir_cache(d_inode(dentry), NULL); + ovl_dir_cache_free(inode); + ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) @@ -627,13 +629,13 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) OVL_XATTR_IMPURE); ovl_drop_write(dentry); } - ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -675,7 +677,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name, static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; - struct inode *dir = d_inode(file->f_path.dentry); + struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of @@ -893,7 +895,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct file *realfile; int err; - err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); + err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; @@ -913,7 +915,7 @@ static int ovl_dir_release(struct inode *inode, struct file *file) if (od->cache) { inode_lock(inode); - ovl_cache_put(od, file->f_path.dentry); + ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); @@ -942,7 +944,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return PTR_ERR(realfile); } od->realfile = realfile; - od->is_real = ovl_dir_is_real(file->f_path.dentry); + od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index c0c20d33691b..bde291623c8c 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -463,7 +463,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) * which have been copied up and have origins), so only need to note * changes to impure entries. */ - if (!ovl_dir_is_real(dentry) || impurity) + if (!ovl_dir_is_real(inode) || impurity) OVL_I(inode)->version++; } @@ -475,10 +475,8 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity) ovl_dir_version_inc(dentry, impurity); } -u64 ovl_dentry_version_get(struct dentry *dentry) +u64 ovl_inode_version_get(struct inode *inode) { - struct inode *inode = d_inode(dentry); - WARN_ON(!inode_is_locked(inode)); return OVL_I(inode)->version; } -- cgit From 637d13b57d853dfc7f5743dfdfb9995c266a6031 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 7 Oct 2022 21:40:54 +0100 Subject: ovl: Kconfig: Fix spelling mistake "undelying" -> "underlying" There is a spelling mistake in a Kconfig description. Fix it. 
Signed-off-by: Colin Ian King Signed-off-by: Miklos Szeredi --- fs/overlayfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index dd188c7996b3..6708e54b0e30 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -96,7 +96,7 @@ config OVERLAY_FS_XINO_AUTO depends on 64BIT help If this config option is enabled then overlay filesystems will use - unused high bits in undelying filesystem inode numbers to map all + unused high bits in underlying filesystem inode numbers to map all inodes to a unified address space. The mapped 64bit inode numbers might not be compatible with applications that expect 32bit inodes. -- cgit From 83fb8abec29383eb0cf35495d21669e38548771b Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Fri, 25 Nov 2022 12:26:00 +0100 Subject: cifs: Add "extbuf" and "extbuflen" args to smb2_compound_op() Will carry the variable-sized reply from SMB_FIND_FILE_POSIX_INFO Signed-off-by: Volker Lendecke Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2inode.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 68e08c85fbb8..1be86ba950b3 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -59,6 +59,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + __u8 **extbuf, size_t *extbuflen, struct kvec *err_iov, int *err_buftype) { struct cop_vars *vars = NULL; @@ -539,7 +540,7 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, - err_iov, err_buftype); + NULL, NULL, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && @@ -555,7 +556,7 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_QUERY_INFO, cfile, NULL, NULL); + SMB2_OP_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); } out: @@ -589,7 +590,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - err_iov, err_buftype); + NULL, NULL, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && @@ -606,7 +607,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL); + SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL, NULL, NULL); } out: @@ -624,7 +625,7 
@@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); } void @@ -646,7 +647,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile, NULL, NULL); + &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -658,7 +659,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL, NULL, NULL); + NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL); } int @@ -667,7 +668,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL); } static int @@ -686,7 +687,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile, NULL, NULL); + command, cfile, NULL, NULL, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -727,7 +728,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL); + &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL); } int @@ -754,7 +755,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, - NULL, NULL); + NULL, NULL, NULL, NULL); cifs_put_tlink(tlink); return rc; } -- cgit From 64ce47cb1b29d7d6aab6dcc287ae1fddb4876bd5 Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Fri, 25 Nov 2022 12:37:44 +0100 Subject: cifs: Parse owner/group for stat in smb311 posix extensions stat was returning default owner and group (unlike readdir) for SMB3.1.1 POSIX extensions Signed-off-by: Volker Lendecke Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/inode.c | 13 +++++++++---- fs/cifs/smb2inode.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- fs/cifs/smb2proto.h | 5 ++++- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4e2ca3c6e5c0..286a5400b94e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -632,6 +632,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, struct super_block *sb, bool adjust_tz, bool symlink) { struct smb311_posix_qinfo *info = &data->posix_fi; @@ -680,8 +682,8 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope } /* else if 
reparse point ... TODO: add support for FIFO and blk dev; special file types */ - fattr->cf_uid = cifs_sb->ctx->linux_uid; /* TODO: map uid and gid from SID */ - fattr->cf_gid = cifs_sb->ctx->linux_gid; + sid_to_id(cifs_sb, owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); @@ -1175,6 +1177,7 @@ smb311_posix_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {0}; bool symlink = false; struct cifs_open_info_data data = {}; + struct cifs_sid owner, group; int rc = 0; int tmprc = 0; @@ -1192,7 +1195,8 @@ smb311_posix_get_inode_info(struct inode **inode, goto out; } - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz, + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, + &owner, &group, &adjust_tz, &symlink); /* @@ -1201,7 +1205,8 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(&fattr, &data, &owner, &group, + sb, adjust_tz, symlink); break; case -EREMOTE: /* DFS link, no metadata available on this server */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1be86ba950b3..fbd46db1023a 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -431,6 +431,21 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, (char *)&idata->posix_fi); } + if (rc == 0) { + unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); + + if (length > sizeof(idata->posix_fi)) { + char *base = (char *)rsp_iov[1].iov_base + + le16_to_cpu(qi_rsp->OutputBufferOffset) + + sizeof(idata->posix_fi); + *extbuflen = length - sizeof(idata->posix_fi); + *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); + if (!*extbuf) + rc = -ENOMEM; + } else { + rc = -EINVAL; + } + } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); if (rqst[2].rq_iov) @@ -569,13 +584,20 @@ out: int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; struct kvec err_iov[3] = {}; int err_buftype[3] = {}; + __u8 *sidsbuf = NULL; + __u8 *sidsbuf_end = NULL; + size_t sidsbuflen = 0; + size_t owner_len, group_len; *adjust_tz = false; *reparse = false; @@ -590,7 +612,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - NULL, NULL, err_iov, err_buftype); + &sidsbuf, &sidsbuflen, err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && @@ -607,10 +629,31 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, 
NULL, NULL, NULL); + SMB2_OP_POSIX_QUERY_INFO, cfile, + &sidsbuf, &sidsbuflen, NULL, NULL); + } + + if (rc == 0) { + sidsbuf_end = sidsbuf + sidsbuflen; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(owner, sidsbuf, owner_len); + + group_len = posix_info_sid_size( + sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) { + rc = -EINVAL; + goto out; + } + memcpy(group, sidsbuf + owner_len, group_len); } out: + kfree(sidsbuf); free_rsp_buf(err_buftype[0], err_iov[0].iov_base); free_rsp_buf(err_buftype[1], err_iov[1].iov_base); free_rsp_buf(err_buftype[2], err_iov[2].iov_base); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index be21b5d26f67..d5d7ffb7711c 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -277,7 +277,10 @@ extern int smb2_query_info_compound(const unsigned int xid, /* query path info from the server using SMB311 POSIX extensions*/ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); + struct cifs_open_info_data *data, + struct cifs_sid *owner, + struct cifs_sid *group, + bool *adjust_tz, bool *reparse); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); -- cgit From 9381666e289852f93be9d7f4f7844017e04f6315 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:33 +0100 Subject: cifs: wire up ->migrate_folio CIFS does not use page private data that needs migration, so it can just wire up filemap_migrate_folio. This prepares for removing ->writepage, which is used as a fallback if no migrate_folio method is set. Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd9698209930..6be924caed39 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -5240,10 +5240,10 @@ const struct address_space_operations cifs_addr_ops = { .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, /* - * TODO: investigate and if useful we could add an cifs_migratePage - * helper (under an CONFIG_MIGRATION) in the future, and also - * investigate and add an is_dirty_writeback helper if needed + * TODO: investigate and if useful we could add an is_dirty_writeback + * helper if needed */ .swap_activate = cifs_swap_activate, .swap_deactivate = cifs_swap_deactivate, @@ -5264,4 +5264,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .read_folio = cifs_read_folio, .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, + .migrate_folio = filemap_migrate_folio, }; -- cgit From bff9018d3a52c45711bd63c446a2c80c0275e935 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:34 +0100 Subject: cifs: stop using generic_writepages generic_writepages is just a wrapper that calls ->writepage on a range, and thus is in the way of eventually removing ->writepage. Switch cifs to just open code it in preparation of removing ->writepage.
[note: I suspect just integrating the small wsize case with the rest of the writeback code might be a better idea here, but that needs someone more familiar with the code] Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6be924caed39..ec14e38411a1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2646,6 +2646,21 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, return rc; } +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc); + +static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret; + + ret = cifs_writepage_locked(page, wbc); + unlock_page(page); + mapping_set_error(mapping, ret); + return ret; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2662,10 +2677,11 @@ static int cifs_writepages(struct address_space *mapping, /* * If wsize is smaller than the page cache size, default to writing - * one page at a time via cifs_writepage + * one page at a time. */ if (cifs_sb->ctx->wsize < PAGE_SIZE) - return generic_writepages(mapping, wbc); + return write_cache_pages(mapping, wbc, cifs_write_one_page, + mapping); xid = get_xid(); if (wbc->range_cyclic) { -- cgit From ebaad77c89921c8237ca17791d5462bd289052d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:18:35 +0100 Subject: cifs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Now that cifs implements ->migrate_folio and doesn't call generic_writepages, the writepage method can be removed.
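For illustration, the pattern these two cifs patches converge on looks roughly like the sketch below (condensed from the diffs above; the example_* names stand in for the cifs helpers): the filesystem keeps a ->writepages implementation that walks the dirty range itself via write_cache_pages(), so no ->writepage entry is needed in address_space_operations once page migration goes through ->migrate_folio.

/* Sketch: open-coding generic_writepages() with a per-page callback. */
static int example_write_one_page(struct page *page,
				  struct writeback_control *wbc, void *data)
{
	struct address_space *mapping = data;
	int ret;

	ret = example_writepage_locked(page, wbc); /* fs-specific writeback */
	unlock_page(page);
	mapping_set_error(mapping, ret);
	return ret;
}

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	/* Visit each dirty page in the range described by wbc. */
	return write_cache_pages(mapping, wbc, example_write_one_page, mapping);
}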
Signed-off-by: Christoph Hellwig Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ec14e38411a1..6701257541ab 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2868,13 +2868,6 @@ retry_write: return rc; } -static int cifs_writepage(struct page *page, struct writeback_control *wbc) -{ - int rc = cifs_writepage_locked(page, wbc); - unlock_page(page); - return rc; -} - static int cifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) @@ -5247,7 +5240,6 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, @@ -5272,7 +5264,6 @@ const struct address_space_operations cifs_addr_ops = { */ const struct address_space_operations cifs_addr_ops_smallbuf = { .read_folio = cifs_read_folio, - .writepage = cifs_writepage, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, -- cgit From d406d26745aba3365ab9171b2d5cbea9c1757305 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 5 Dec 2022 23:31:53 -0300 Subject: cifs: skip alloc when request has no pages When smb3_init_transform_rq() was being called with requests (@old_rq) which had no pages, it was unnecessarily allocating a single page for every request in @new_rq. Fix this by skipping page array allocation when requests have no pages (e.g. !smb_rqst::rq_npages). Also get rid of deprecated kmap() and use kmap_local_page() instead while we're at it. 
Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bfaafd02fb1f..72b22d033ed5 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4445,21 +4445,27 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, int rc = -ENOMEM; for (i = 1; i < num_rqst; i++) { - npages = old_rq[i - 1].rq_npages; + struct smb_rqst *old = &old_rq[i - 1]; + struct smb_rqst *new = &new_rq[i]; + + orig_len += smb_rqst_len(server, old); + new->rq_iov = old->rq_iov; + new->rq_nvec = old->rq_nvec; + + npages = old->rq_npages; + if (!npages) + continue; + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto err_free; - new_rq[i].rq_pages = pages; - new_rq[i].rq_npages = npages; - new_rq[i].rq_offset = old_rq[i - 1].rq_offset; - new_rq[i].rq_pagesz = old_rq[i - 1].rq_pagesz; - new_rq[i].rq_tailsz = old_rq[i - 1].rq_tailsz; - new_rq[i].rq_iov = old_rq[i - 1].rq_iov; - new_rq[i].rq_nvec = old_rq[i - 1].rq_nvec; - - orig_len += smb_rqst_len(server, &old_rq[i - 1]); + new->rq_pages = pages; + new->rq_npages = npages; + new->rq_offset = old->rq_offset; + new->rq_pagesz = old->rq_pagesz; + new->rq_tailsz = old->rq_tailsz; for (j = 0; j < npages; j++) { pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); @@ -4472,14 +4478,14 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, char *dst, *src; unsigned int offset, len; - rqst_page_get_length(&new_rq[i], j, &len, &offset); + rqst_page_get_length(new, j, &len, &offset); - dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset; - src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset; + dst = kmap_local_page(new->rq_pages[j]) + offset; + src = kmap_local_page(old->rq_pages[j]) + offset; memcpy(dst, src, len); - kunmap(new_rq[i].rq_pages[j]); - kunmap(old_rq[i - 1].rq_pages[j]); + kunmap(new->rq_pages[j]); + kunmap(old->rq_pages[j]); } } -- cgit From 52f31ed228212ba572c44e15e818a3a5c74122c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 8 Dec 2022 08:29:22 -0800 Subject: xfs: dquot shrinker doesn't check for XFS_DQFLAG_FREEING Resulting in a UAF if the shrinker races with some other dquot freeing mechanism that sets XFS_DQFLAG_FREEING before the dquot is removed from the LRU. This can occur if a dquot purge races with drop_caches. Reported-by: syzbot+912776840162c13db1a3@syzkaller.appspotmail.com Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_qm.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 18bb4ec4d7c9..ff53d40a2dae 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -422,6 +422,14 @@ xfs_qm_dquot_isolate( if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; + /* + * If something else is freeing this dquot and hasn't yet removed it + * from the LRU, leave it for the freeing task to complete the freeing + * process rather than risk it being free from under us here. + */ + if (dqp->q_flags & XFS_DQFLAG_FREEING) + goto out_miss_unlock; + /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. @@ -441,10 +449,8 @@ xfs_qm_dquot_isolate( * skip it so there is time for the IO to complete before we try to * reclaim it again on the next LRU pass. 
*/ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - goto out_miss_busy; - } + if (!xfs_dqflock_nowait(dqp)) + goto out_miss_unlock; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -478,6 +484,8 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; +out_miss_unlock: + xfs_dqunlock(dqp); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); -- cgit From 1c8a8ec0a0e9a1176022a35c4daf04fe1594d270 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:44 +0100 Subject: f2fs: remove struct segment_allocation default_salloc_ops There is only a single instance of these ops, so remove the indirection and call allocate_segment_by_default directly. Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++--------- fs/f2fs/segment.h | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0ff451ea18f6..bbe6556799ce 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2926,7 +2926,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + allocate_segment_by_default(sbi, type, true); locate_dirty_segment(sbi, old_segno); } @@ -2957,10 +2957,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@ -3284,7 +3280,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); else - sit_i->s_ops->allocate_segment(sbi, type, false); + allocate_segment_by_default(sbi, type, false); } /* * segment dirty status should be updated after segment allocation, @@ -4270,9 +4266,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index be8f2d7d007b..3ad1b7b6fa94 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -222,10 +222,6 @@ struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); -}; - #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { @@ -235,8 +231,6 @@ struct revoke_entry { }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ -- cgit From 8442d94b8ac8d5d8300725a9ffa9def526b71170 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:45 +0100 Subject: f2fs: open code allocate_segment_by_default allocate_segment_by_default has just two callers, which use very different code paths inside it based on the force parameter. Just open code the logic in the two callers using a new helper to decide if a new segment should be allocated.
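Condensed from the diff that follows, the resulting call sites look roughly like this sketch (the force == true caller always takes a fresh segment, while the force == false caller consults the new helper first):

/* force == true path (__allocate_new_segment): */
	new_curseg(sbi, type, true);
	stat_inc_seg_type(sbi, curseg);

/* force == false path (f2fs_allocate_data_block): */
	if (need_new_seg(sbi, type))
		new_curseg(sbi, type, false);
	else
		change_curseg(sbi, type, true);
	stat_inc_seg_type(sbi, curseg);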
Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bbe6556799ce..c4e118eb7d19 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2849,31 +2849,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, return 0; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; } void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@ -2926,7 +2915,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - allocate_segment_by_default(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } @@ -3276,11 +3266,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. 
+ */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - allocate_segment_by_default(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, -- cgit From 5bcd655fffaec24e849bda1207446f5cc821713e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:46 +0100 Subject: f2fs: remove the unused flush argument to change_curseg Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4e118eb7d19..9486ca49ecb1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2656,7 +2656,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2664,9 +2664,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) struct f2fs_summary_block *sum_node; struct page *sum_page; - if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -2705,7 +2703,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@ -2880,7 +2878,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true); @@ -3276,7 +3274,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (need_new_seg(sbi, type)) new_curseg(sbi, type, false); else - change_curseg(sbi, type, true); + change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } } @@ -3539,7 +3537,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3567,7 +3565,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; -- cgit From 870af777da22505851174a34c0228042d7ed5f5f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 25 Nov 2022 19:47:36 +0800 Subject: f2fs: do some cleanup for f2fs module init Just for cleanup, no functional changes. 
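The bulk of this cleanup is one recurring simplification; sketched with an illustrative slab cache (not one of the actual f2fs caches), the explicit NULL check collapses into a ternary return:

/* Before: */
	example_slab = f2fs_kmem_cache_create("example", sizeof(struct example));
	if (!example_slab)
		return -ENOMEM;
	return 0;

/* After: */
	example_slab = f2fs_kmem_cache_create("example", sizeof(struct example));
	return example_slab ? 0 : -ENOMEM;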
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 46 ++++++---------------------------------------- fs/f2fs/data.c | 14 ++++---------- fs/f2fs/gc.c | 4 +--- fs/f2fs/recovery.c | 4 +--- fs/f2fs/super.c | 8 ++------ 5 files changed, 14 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 74d3f2d2271f..9723f0bed923 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages, int f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); - if (!compress_page_pool) - return -ENOMEM; - - return 0; + return compress_page_pool ? 0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) @@ -1983,9 +1980,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); - if (!sbi->page_array_slab) - return -ENOMEM; - return 0; + return sbi->page_array_slab ? 0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) @@ -1993,53 +1988,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) kmem_cache_destroy(sbi->page_array_slab); } -static int __init f2fs_init_cic_cache(void) +int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; - return 0; -} - -static void f2fs_destroy_cic_cache(void) -{ - kmem_cache_destroy(cic_entry_slab); -} - -static int __init f2fs_init_dic_cache(void) -{ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) - return -ENOMEM; - return 0; -} - -static void f2fs_destroy_dic_cache(void) -{ - kmem_cache_destroy(dic_entry_slab); -} - -int __init f2fs_init_compress_cache(void) -{ - int err; - - err = f2fs_init_cic_cache(); - if (err) - goto out; - err = f2fs_init_dic_cache(); - if (err) goto free_cic; return 0; free_cic: - f2fs_destroy_cic_cache(); -out: + kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { - f2fs_destroy_dic_cache(); - f2fs_destroy_cic_cache(); + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 560fa80590e9..35c19248b1e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset; int __init f2fs_init_bioset(void) { - if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, - 0, BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) @@ -4090,9 +4088,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); - if (!sbi->post_read_wq) - return -ENOMEM; - return 0; + return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) @@ -4105,9 +4101,7 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - if (!bio_entry_slab) - return -ENOMEM; - return 0; + return bio_entry_slab ? 
0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1b68eda2235..d19e26b2e875 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1904,9 +1904,7 @@ int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dea95b48b647..77fd453949b1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -923,9 +923,7 @@ int __init f2fs_create_recovery_cache(void) { fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; - return 0; + return fsync_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index daf14b55a972..a5f6f632cf7c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -288,9 +288,7 @@ static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); - if (!f2fs_cf_name_slab) - return -ENOMEM; - return 0; + return f2fs_cf_name_slab ? 0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) @@ -4647,9 +4645,7 @@ static int __init init_inodecache(void) { f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); - if (!f2fs_inode_cachep) - return -ENOMEM; - return 0; + return f2fs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) -- cgit From 78742d4d056df7d2fad241c90185d281bf924844 Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Tue, 11 Oct 2022 16:57:58 +0100 Subject: ext4: remove trailing newline from ext4_msg() message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ext4_msg() function adds a new line to the message. Remove extra '\n' from call to ext4_msg() in ext4_orphan_cleanup(). Signed-off-by: Luís Henriques Link: https://lore.kernel.org/r/20221011155758.15287-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/orphan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 69a9cf9137a6..e5b47dda3317 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " - "clearing orphan list.\n"); + "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); -- cgit From d323877484765aaacbb2769b06e355c2041ed115 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:07 +0800 Subject: ext4: fix bug_on in __es_tree_search caused by bad quota inode We got an issue as follows: ================================================================== kernel BUG at fs/ext4/extents_status.c:202!
invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 810 Comm: mount Not tainted 6.1.0-rc1-next-g9631525255e3 #352 RIP: 0010:__es_tree_search.isra.0+0xb8/0xe0 RSP: 0018:ffffc90001227900 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000077512a0f RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000002a10 RDI: ffff8881004cd0c8 RBP: ffff888177512ac8 R08: 47ffffffffffffff R09: 0000000000000001 R10: 0000000000000001 R11: 00000000000679af R12: 0000000000002a10 R13: ffff888177512d88 R14: 0000000077512a10 R15: 0000000000000000 FS: 00007f4bd76dbc40(0000)GS:ffff88842fd00000(0000)knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005653bf993cf8 CR3: 000000017bfdf000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ext4_es_cache_extent+0xe2/0x210 ext4_cache_extents+0xd2/0x110 ext4_find_extent+0x5d5/0x8c0 ext4_ext_map_blocks+0x9c/0x1d30 ext4_map_blocks+0x431/0xa50 ext4_getblk+0x82/0x340 ext4_bread+0x14/0x110 ext4_quota_read+0xf0/0x180 v2_read_header+0x24/0x90 v2_check_quota_file+0x2f/0xa0 dquot_load_quota_sb+0x26c/0x760 dquot_load_quota_inode+0xa5/0x190 ext4_enable_quotas+0x14c/0x300 __ext4_fill_super+0x31cc/0x32c0 ext4_fill_super+0x115/0x2d0 get_tree_bdev+0x1d2/0x360 ext4_get_tree+0x19/0x30 vfs_get_tree+0x26/0xe0 path_mount+0x81d/0xfc0 do_mount+0x8d/0xc0 __x64_sys_mount+0xc0/0x160 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ================================================================== The above issue may happen as follows: ------------------------------------- ext4_fill_super ext4_orphan_cleanup ext4_enable_quotas ext4_quota_enable ext4_iget --> get error inode <5> ext4_ext_check_inode --> Wrong imode makes it escape inspection make_bad_inode(inode) --> EXT4_BOOT_LOADER_INO set imode dquot_load_quota_inode vfs_setup_quota_inode --> check pass dquot_load_quota_sb v2_check_quota_file v2_read_header ext4_quota_read ext4_bread ext4_getblk ext4_map_blocks ext4_ext_map_blocks ext4_find_extent ext4_cache_extents ext4_es_cache_extent __es_tree_search.isra.0 ext4_es_end --> Wrong extents trigger BUG_ON In the above issue, s_usr_quota_inum is set to 5, but inode<5> contains incorrect imode and disordered extents. Because 5 is EXT4_BOOT_LOADER_INO, the ext4_ext_check_inode check in the ext4_iget function can be bypassed; finally, the extents that are not checked trigger the BUG_ON in the __es_tree_search function. To solve this issue, check whether the inode is bad_inode in vfs_setup_quota_inode().
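The fix itself is a single early guard; as the diff below shows, vfs_setup_quota_inode() now rejects a bad inode before the other sanity checks run, sketched here for clarity:

	/* Reject inodes that were marked bad (e.g. bogus imode) at iget time. */
	if (is_bad_inode(inode))
		return -EUCLEAN;
	if (!S_ISREG(inode->i_mode))
		return -EACCES;
	if (IS_RDONLY(inode))
		return -EROFS;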
Signed-off-by: Baokun Li Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0427b44bfee5..f27faf5db554 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); + if (is_bad_inode(inode)) + return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) -- cgit From 07342ec259df2a35d6a34aebce010567a80a0e15 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:08 +0800 Subject: ext4: add helper to check quota inums Before quota is enabled, a check on the preset quota inums in ext4_super_block is added to prevent wrong quota inodes from being loaded. In addition, when the quota fails to be enabled, the quota type and quota inum are printed to facilitate fault locating. Signed-off-by: Baokun Li Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0f71542cf453..6f944f5e4b3f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6886,6 +6886,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; } +static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } +} + static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { @@ -6902,9 +6916,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; + if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { - ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); return PTR_ERR(qf_inode); } @@ -6943,8 +6964,9 @@ int ext4_enable_quotas(struct super_block *sb) if (err) { ext4_warning(sb, "Failed to enable quota tracking " - "(type=%d, err=%d). Please run " - "e2fsck to fix.", type, err); + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); for (type--; type >= 0; type--) { struct inode *inode; -- cgit From 63b1e9bccb71fe7d7e3ddc9877dbdc85e5d2d023 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:09 +0800 Subject: ext4: add EXT4_IGET_BAD flag to prevent unexpected bad inode There are many places that will get unhappy (and crash) when ext4_iget() returns a bad inode. However, when the boot loader inode is loaded with iget, a bad inode is allowed to be returned, because the inode may not be initialized. This mechanism can be used to bypass some checks and cause a panic. To solve this problem, we add a special iget flag EXT4_IGET_BAD.
Only with this flag do we return a bad inode from ext4_iget(); otherwise we always return an error code if the inode is a bad inode (suggested by Jan Kara). Signed-off-by: Baokun Li Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 3 ++- fs/ext4/inode.c | 8 +++++++- fs/ext4/ioctl.c | 3 ++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8d5453852f98..2b574b143bde 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ - EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6d16241666e6..a4bf643aa08b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5058,8 +5058,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - brelse(iloc.bh); + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + ext4_error_inode(inode, function, line, 0, + "bad inode without EXT4_IGET_BAD flag"); + ret = -EUCLEAN; + goto bad_inode; + } + brelse(iloc.bh); unlock_new_inode(inode); return inode; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 95dfea28bf4e..9ed7b9fe2132 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -374,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb, blkcnt_t blocks; unsigned short bytes; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, + EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); -- cgit From 991ed014de0840c5dc405b679168924afb2952ac Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 26 Oct 2022 12:23:10 +0800 Subject: ext4: fix bug_on in __es_tree_search caused by bad boot loader inode We got an issue as follows: ================================================================== kernel BUG at fs/ext4/extents_status.c:203!
invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 945 Comm: cat Not tainted 6.0.0-next-20221007-dirty #349 RIP: 0010:ext4_es_end.isra.0+0x34/0x42 RSP: 0018:ffffc9000143b768 EFLAGS: 00010203 RAX: 0000000000000000 RBX: ffff8881769cd0b8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8fc27cf7 RDI: 00000000ffffffff RBP: ffff8881769cd0bc R08: 0000000000000000 R09: ffffc9000143b5f8 R10: 0000000000000001 R11: 0000000000000001 R12: ffff8881769cd0a0 R13: ffff8881768e5668 R14: 00000000768e52f0 R15: 0000000000000000 FS: 00007f359f7f05c0(0000)GS:ffff88842fd00000(0000)knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f359f5a2000 CR3: 000000017130c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __es_tree_search.isra.0+0x6d/0xf5 ext4_es_cache_extent+0xfa/0x230 ext4_cache_extents+0xd2/0x110 ext4_find_extent+0x5d5/0x8c0 ext4_ext_map_blocks+0x9c/0x1d30 ext4_map_blocks+0x431/0xa50 ext4_mpage_readpages+0x48e/0xe40 ext4_readahead+0x47/0x50 read_pages+0x82/0x530 page_cache_ra_unbounded+0x199/0x2a0 do_page_cache_ra+0x47/0x70 page_cache_ra_order+0x242/0x400 ondemand_readahead+0x1e8/0x4b0 page_cache_sync_ra+0xf4/0x110 filemap_get_pages+0x131/0xb20 filemap_read+0xda/0x4b0 generic_file_read_iter+0x13a/0x250 ext4_file_read_iter+0x59/0x1d0 vfs_read+0x28f/0x460 ksys_read+0x73/0x160 __x64_sys_read+0x1e/0x30 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ================================================================== In the above issue, ioctl invokes the swap_inode_boot_loader function to swap inode<5> and inode<12>. However, inode<5> contains incorrect imode and disordered extents, and i_nlink is set to 1. The extents check for the inode in the ext4_iget function can be bypassed because 5 is EXT4_BOOT_LOADER_INO. While links_count is set to 1, the extents are not initialized in swap_inode_boot_loader. After the ioctl command is executed successfully, the extents are swapped to inode<12>; in this case, run the `cat` command to view inode<12>, and BUG_ON is triggered due to the incorrect extents. When the boot loader inode is not initialized, its imode can be one of the following: 1) the imode is a bad type, which is marked as bad_inode in ext4_iget and set to S_IFREG. 2) the imode is a good type but not S_IFREG. 3) the imode is S_IFREG. The BUG_ON may be triggered by bypassing the check in cases 1 and 2. Therefore, when the boot loader inode is bad_inode or its imode is not S_IFREG, initialize the inode to avoid triggering the BUG.
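Sketching the resulting check (the full diff follows): instead of keying on i_nlink alone, swap_inode_boot_loader() now treats a bad or non-regular boot loader inode as never used and (re)initializes it:

	if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
		/* this inode has never been used as a BOOT_LOADER */
		set_nlink(inode_bl, 1);
		i_uid_write(inode_bl, 0);
		/* ... remaining initialization as in the original function ... */
	}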
Signed-off-by: Baokun Li Reviewed-by: Jason Yan Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221026042310.3839669-5-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 9ed7b9fe2132..e5f60057db5b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -425,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); - if (inode_bl->i_nlink == 0) { + if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); -- cgit From 3bf678a0f9c017c9ba7c581541dbc8453452a7ae Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 31 Oct 2022 13:58:33 +0800 Subject: ext4: fix undefined behavior in bit shift for ext4_check_flag_values Shifting a signed 32-bit value by 31 bits is undefined, so change the shifted value to unsigned. The UBSAN warning call trace looks like this: UBSAN: shift-out-of-bounds in fs/ext4/ext4.h:591:2 left shift of 1 by 31 places cannot be represented in type 'int' Call Trace: dump_stack_lvl+0x7d/0xa5 dump_stack+0x15/0x1b ubsan_epilogue+0xe/0x4e __ubsan_handle_shift_out_of_bounds+0x1e7/0x20c ext4_init_fs+0x5a/0x277 do_one_initcall+0x76/0x430 kernel_init_freeable+0x3b3/0x422 kernel_init+0x24/0x1e0 ret_from_fork+0x1f/0x30 Fixes: 9a4c80194713 ("ext4: ensure Inode flags consistency are checked at build time") Signed-off-by: Gaosheng Cui Link: https://lore.kernel.org/r/20221031055833.3966222-1-cuigaosheng1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2b574b143bde..3afdd99bb214 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -558,7 +558,7 @@ enum { * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) -- cgit From 105c78e12468413e426625831faa7db4284e1fec Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 1 Nov 2022 22:33:12 -0700 Subject: ext4: don't allow journal inode to have encrypt flag Mounting a filesystem whose journal inode has the encrypt flag causes a NULL dereference in fscrypt_limit_io_blocks() when the 'inlinecrypt' mount option is used. The problem is that when jbd2_journal_init_inode() calls bmap(), it eventually finds its way into ext4_iomap_begin(), which calls fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() requires that if the inode is encrypted, then its encryption key must already be set up. That's not the case here, since the journal inode is never "opened" like a normal file would be. Hence the crash. A reproducer is: mkfs.ext4 -F /dev/vdb debugfs -w /dev/vdb -R "set_inode_field <8> flags 0x80808" mount /dev/vdb /mnt -o inlinecrypt To fix this, make ext4 consider journal inodes with the encrypt flag to be invalid. (Note, maybe other flags should be rejected on the journal inode too. For now, this is just the minimal fix for the above issue.)
I've marked this as fixing the commit that introduced the call to fscrypt_limit_io_blocks(), since that's what made an actual crash start being possible. But this fix could be applied to any version of ext4 that supports the encrypt feature. Reported-by: syzbot+ba9dac45bc76c490b7c3@syzkaller.appspotmail.com Fixes: 38ea50daa7a4 ("ext4: support direct I/O with fscrypt using blk-crypto") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221102053312.189962-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6f944f5e4b3f..9185131e01e5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5723,7 +5723,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { + if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; -- cgit From a71248b1accb2b42e4980afef4fa4a27fa0e36f5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 2 Nov 2022 16:06:33 +0800 Subject: ext4: fix use-after-free in ext4_orphan_cleanup I caught an issue as follows: ================================================================== BUG: KASAN: use-after-free in __list_add_valid+0x28/0x1a0 Read of size 8 at addr ffff88814b13f378 by task mount/710 CPU: 1 PID: 710 Comm: mount Not tainted 6.1.0-rc3-next #370 Call Trace: dump_stack_lvl+0x73/0x9f print_report+0x25d/0x759 kasan_report+0xc0/0x120 __asan_load8+0x99/0x140 __list_add_valid+0x28/0x1a0 ext4_orphan_cleanup+0x564/0x9d0 [ext4] __ext4_fill_super+0x48e2/0x5300 [ext4] ext4_fill_super+0x19f/0x3a0 [ext4] get_tree_bdev+0x27b/0x450 ext4_get_tree+0x19/0x30 [ext4] vfs_get_tree+0x49/0x150 path_mount+0xaae/0x1350 do_mount+0xe2/0x110 __x64_sys_mount+0xf0/0x190 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] ================================================================== The above issue may happen as follows: ------------------------------------- ext4_fill_super ext4_orphan_cleanup --- loop1: assume last_orphan is 12 --- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan) ext4_truncate --> return 0 ext4_inode_attach_jinode --> return -ENOMEM iput(inode) --> free inode<12> --- loop2: last_orphan is still 12 --- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); // use inode<12> and trigger UAF To solve this issue, we need to propagate the return value of ext4_inode_attach_jinode() appropriately.
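The fix is the classic error-propagation pattern; condensed from the diff below, ext4_truncate() now stores the result of ext4_inode_attach_jinode() instead of discarding it, so the failure reaches the orphan-cleanup caller:

/* Before: a failure fell through with err still holding the old value. */
	if (ext4_inode_attach_jinode(inode) < 0)
		goto out_trace;

/* After: the error is propagated to the caller. */
	err = ext4_inode_attach_jinode(inode);
	if (err)
		goto out_trace;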
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221102080633.1630225-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a4bf643aa08b..181bc161b1ac 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4231,7 +4231,8 @@ int ext4_truncate(struct inode *inode) /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { - if (ext4_inode_attach_jinode(inode) < 0) + err = ext4_inode_attach_jinode(inode); + if (err) goto out_trace; } -- cgit From 0fbcb5251fc81b58969b272c4fb7374a7b922e3e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:35 -0800 Subject: ext4: disable fast-commit of encrypted dir operations fast-commit of create, link, and unlink operations in encrypted directories is completely broken because the unencrypted filenames are being written to the fast-commit journal instead of the encrypted filenames. These operations can't be replayed, as encryption keys aren't present at journal replay time. It is also an information leak. Until if/when we can get this working properly, make encrypted directory operations ineligible for fast-commit. Note that fast-commit operations on encrypted regular files continue to be allowed, as they seem to work. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 41 +++++++++++++++++++++++++---------------- fs/ext4/fast_commit.h | 1 + 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0f6d0a80467d..6d98f2b39b77 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -420,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); + + if (IS_ENCRYPTED(dir)) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, + NULL); + mutex_lock(&ei->i_fc_lock); + return -EOPNOTSUPP; + } + node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; - node->fcd_parent = dentry->d_parent->d_inode->i_ino; + node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; if (dentry->d_name.len > DNAME_INLINE_LEN) { node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -2249,17 +2258,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) journal->j_fc_cleanup_callback = ext4_fc_cleanup; } -static const char *fc_ineligible_reasons[] = { - "Extended 
attributes changed", - "Cross rename", - "Journal flag changed", - "Insufficient memory", - "Swap boot", - "Resize", - "Dir renamed", - "Falloc range op", - "Data journalling", - "FC Commit Failed" +static const char * const fc_ineligible_reasons[] = { + [EXT4_FC_REASON_XATTR] = "Extended attributes changed", + [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", + [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", + [EXT4_FC_REASON_NOMEM] = "Insufficient memory", + [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", + [EXT4_FC_REASON_RESIZE] = "Resize", + [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", + [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", + [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", + [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index a6154c3ed135..256f2ad27204 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -96,6 +96,7 @@ enum { EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, EXT4_FC_REASON_INODE_JOURNAL_DATA, + EXT4_FC_REASON_ENCRYPTED_FILENAME, EXT4_FC_REASON_MAX }; -- cgit From 4c0d5778385cb3618ff26a561ce41de2b7d9de70 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:36 -0800 Subject: ext4: don't set up encryption key during jbd2 transaction Commit a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature") extended the scope of the transaction in ext4_unlink() too far, making it include the call to ext4_find_entry(). However, ext4_find_entry() can deadlock when called from within a transaction because it may need to set up the directory's encryption key. Fix this by restoring the transaction to its original scope. Reported-by: syzbot+1a748d0007eeac3ab079@syzkaller.appspotmail.com Fixes: a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-3-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/fast_commit.c | 2 +- fs/ext4/namei.c | 44 ++++++++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3afdd99bb214..4e739902dc03 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3620,8 +3620,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); -extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6d98f2b39b77..da0c8228cf9c 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1397,7 +1397,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, return 0; } - ret = __ext4_unlink(NULL, old_parent, &entry, inode); + ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. 
*/ if (ret == -ENOENT) ret = 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c08c0aba1883..a789ea9b61a0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3204,14 +3204,20 @@ end_rmdir: return retval; } -int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode) +int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, + struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; + handle_t *handle; int skip_remove_dentry = 0; + /* + * Keep this outside the transaction; it may have to set up the + * directory's encryption key, which isn't GFP_NOFS-safe. + */ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); @@ -3228,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else - goto out; + goto out_bh; + } + + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_bh; } if (IS_DIRSYNC(dir)) @@ -3237,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto out; + goto out_handle; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto out; + goto out_handle; } else { retval = 0; } @@ -3255,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); - -out: + if (dentry && !retval) + ext4_fc_track_unlink(handle, dentry); +out_handle: + ext4_journal_stop(handle); +out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { - handle_t *handle; int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3281,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (retval) goto out_trace; - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - goto out_trace; - } - - retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); - if (!retval) - ext4_fc_track_unlink(handle, dentry); + retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -3301,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - if (handle) - ext4_journal_stop(handle); out_trace: trace_ext4_unlink_exit(dentry, retval); -- cgit From 594bc43b410316d70bb42aeff168837888d96810 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:37 -0800 Subject: ext4: fix leaking uninitialized memory in fast-commit journal When space at the end of fast-commit journal blocks is unused, make sure to zero it out so that uninitialized memory is not leaked to disk. 
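The rule being enforced is general: any fixed-size block assembled in a freshly allocated buffer must have its unused tail cleared before it is written out, or stale heap contents reach the device. A self-contained userspace sketch of that discipline (block size and names are assumptions, not the fast-commit code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BLOCK_SIZE 4096

static unsigned char *build_block(const void *payload, size_t used)
{
	unsigned char *blk = malloc(BLOCK_SIZE); /* heap contents are stale */

	if (!blk)
		return NULL;
	memcpy(blk, payload, used);
	/* the fix in miniature: clear the unused tail before writeout */
	memset(blk + used, 0, BLOCK_SIZE - used);
	return blk;
}

int main(void)
{
	static const char tail_tag[] = "fc tail";
	unsigned char *blk = build_block(tail_tag, sizeof(tail_tag));

	if (blk) {
		printf("last byte: %u\n", blk[BLOCK_SIZE - 1]); /* 0, not garbage */
		free(blk);
	}
	return 0;
}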
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index da0c8228cf9c..1e8be0554239 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -737,6 +737,9 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); if (pad_len > 0) ext4_fc_memzero(sb, tl + 1, pad_len, crc); + /* Don't leak uninitialized memory in the unused last byte. */ + *((u8 *)(tl + 1) + pad_len) = 0; + ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); @@ -793,6 +796,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) dst += sizeof(tail.fc_tid); tail.fc_crc = cpu_to_le32(crc); ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + dst += sizeof(tail.fc_crc); + memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); -- cgit From 64b4a25c3de81a69724e888ec2db3533b43816e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:38 -0800 Subject: ext4: add missing validation of fast-commit record lengths Validate the inode and filename lengths in fast-commit journal records so that a malicious fast-commit journal cannot cause a crash by having invalid values for these. Also validate EXT4_FC_TAG_DEL_RANGE. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-5-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 38 +++++++++++++++++++------------------- fs/ext4/fast_commit.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1e8be0554239..d5ad4b2b235d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1991,32 +1991,31 @@ void ext4_fc_replay_cleanup(struct super_block *sb) kfree(sbi->s_fc_replay_state.fc_modified_inodes); } -static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl, - u8 *val, u8 *end) +static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, + int tag, int len) { - if (val + tl->fc_len > end) - return false; - - /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do - * journal rescan before do CRC check. Other tags length check will - * rely on CRC check. 
- */ - switch (tl->fc_tag) { + switch (tag) { case EXT4_FC_TAG_ADD_RANGE: - return (sizeof(struct ext4_fc_add_range) == tl->fc_len); - case EXT4_FC_TAG_TAIL: - return (sizeof(struct ext4_fc_tail) <= tl->fc_len); - case EXT4_FC_TAG_HEAD: - return (sizeof(struct ext4_fc_head) == tl->fc_len); + return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: + return len == sizeof(struct ext4_fc_del_range); + case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: - case EXT4_FC_TAG_CREAT: + len -= sizeof(struct ext4_fc_dentry_info); + return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: + len -= sizeof(struct ext4_fc_inode); + return len >= EXT4_GOOD_OLD_INODE_SIZE && + len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: - default: - return true; + return true; /* padding can have any length */ + case EXT4_FC_TAG_TAIL: + return len >= sizeof(struct ext4_fc_tail); + case EXT4_FC_TAG_HEAD: + return len == sizeof(struct ext4_fc_head); } + return false; } /* @@ -2079,7 +2078,8 @@ static int ext4_fc_replay_scan(journal_t *journal, cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; - if (!ext4_fc_tag_len_isvalid(&tl, val, end)) { + if (tl.fc_len > end - val || + !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 256f2ad27204..2fadb2c4780c 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -58,7 +58,7 @@ struct ext4_fc_dentry_info { __u8 fc_dname[]; }; -/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ +/* Value structure for EXT4_FC_TAG_INODE. */ struct ext4_fc_inode { __le32 fc_ino; __u8 fc_raw_inode[]; -- cgit From 8415ce07ecf0cc25efdd5db264a7133716e503cf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:39 -0800 Subject: ext4: fix unaligned memory access in ext4_fc_reserve_space() As is done elsewhere in the file, build the struct ext4_fc_tl on the stack and memcpy() it into the buffer, rather than directly writing it to a potentially-unaligned location in the buffer. 
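The underlying pattern: storing through a struct pointer cast from an arbitrary buffer offset is undefined behaviour on strict-alignment architectures, while building the struct on the stack and memcpy()ing the bytes is safe everywhere. A standalone sketch (simplified header layout, not the real struct ext4_fc_tl):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct tl {			/* simplified stand-in for a TLV header */
	uint16_t tag;
	uint16_t len;
};

int main(void)
{
	unsigned char buf[64];
	size_t off = 3;		/* deliberately not 2-byte aligned */
	struct tl tl = { .tag = 5, .len = 12 };
	struct tl out;

	/* unsafe on strict-alignment CPUs:
	 *	*(struct tl *)(buf + off) = tl;
	 * safe everywhere: build on the stack, then byte-copy */
	memcpy(buf + off, &tl, sizeof(tl));
	memcpy(&out, buf + off, sizeof(out));	/* reads need the same care */
	printf("tag=%u len=%u\n", out.tag, out.len);
	return 0;
}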
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-6-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index d5ad4b2b235d..892fa7c7a768 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -675,6 +675,15 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) /* Ext4 commit path routines */ +/* memcpy to fc reserved space and update CRC */ +static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, + int len, u32 *crc) +{ + if (crc) + *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); + return memcpy(dst, src, len); +} + /* memzero and update CRC */ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, u32 *crc) @@ -700,12 +709,13 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { - struct ext4_fc_tl *tl; + struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int pad_len; + u8 *dst; /* * After allocating len, we should have space at least for a 0 byte @@ -729,16 +739,18 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) return sbi->s_fc_bh->b_data + off; } /* Need to add PAD tag */ - tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); - tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); + dst = sbi->s_fc_bh->b_data + off; + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; - tl->fc_len = cpu_to_le16(pad_len); - if (crc) - *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); - if (pad_len > 0) - ext4_fc_memzero(sb, tl + 1, pad_len, crc); + tl.fc_len = cpu_to_le16(pad_len); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + dst += EXT4_FC_TAG_BASE_LEN; + if (pad_len > 0) { + ext4_fc_memzero(sb, dst, pad_len, crc); + dst += pad_len; + } /* Don't leak uninitialized memory in the unused last byte. */ - *((u8 *)(tl + 1) + pad_len) = 0; + *dst = 0; ext4_fc_submit_bh(sb, false); @@ -750,15 +762,6 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) return sbi->s_fc_bh->b_data; } -/* memcpy to fc reserved space and update CRC */ -static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, - int len, u32 *crc) -{ - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); - return memcpy(dst, src, len); -} - /* * Complete a fast commit by writing tail tag. * -- cgit From 48a6a66db82b8043d298a630f22c62d43550cae5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:40 -0800 Subject: ext4: fix off-by-one errors in fast-commit block filling Due to several different off-by-one errors, or perhaps due to a late change in design that wasn't fully reflected in the code that was actually merged, there are several very strange constraints on how fast-commit blocks are filled with tlv entries: - tlvs must start at least 10 bytes before the end of the block, even though the minimum tlv length is 8. Otherwise, the replay code will ignore them. (BUG: ext4_fc_reserve_space() could violate this requirement if called with a len of blocksize - 9 or blocksize - 8. 
Fortunately, this doesn't seem to happen currently.) - tlvs must end at least 1 byte before the end of the block. Otherwise the replay code will consider them to be invalid. This quirk contributed to a bug (fixed by an earlier commit) where uninitialized memory was being leaked to disk in the last byte of blocks. Also, strangely these constraints don't apply to the replay code in e2fsprogs, which will accept any tlvs in the blocks (with no bounds checks at all, but that is a separate issue...). Given that this all seems to be a bug, let's fix it by just filling blocks with tlv entries in the natural way. Note that old kernels will be unable to replay fast-commit journals created by kernels that have this commit. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: # v5.10+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-7-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 66 +++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 892fa7c7a768..7ed71c652f67 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -714,43 +714,43 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; - int pad_len; + int remaining; u8 *dst; /* - * After allocating len, we should have space at least for a 0 byte - * padding. + * If 'len' is too long to fit in any block alongside a PAD tlv, then we + * cannot fulfill the request. */ - if (len + EXT4_FC_TAG_BASE_LEN > bsize) + if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; - if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) { - /* - * Only allocate from current buffer if we have enough space for - * this request AND we have space to add a zero byte padding. - */ - if (!sbi->s_fc_bh) { - ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); - if (ret) - return NULL; - sbi->s_fc_bh = bh; - } - sbi->s_fc_bytes += len; - return sbi->s_fc_bh->b_data + off; + if (!sbi->s_fc_bh) { + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; } - /* Need to add PAD tag */ dst = sbi->s_fc_bh->b_data + off; + + /* + * Allocate the bytes in the current block if we can do so while still + * leaving enough space for a PAD tlv. + */ + remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; + if (len <= remaining) { + sbi->s_fc_bytes += len; + return dst; + } + + /* + * Else, terminate the current block with a PAD tlv, then allocate a new + * block and allocate the bytes at the start of that new block. + */ + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); - pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; - tl.fc_len = cpu_to_le16(pad_len); + tl.fc_len = cpu_to_le16(remaining); ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); - dst += EXT4_FC_TAG_BASE_LEN; - if (pad_len > 0) { - ext4_fc_memzero(sb, dst, pad_len, crc); - dst += pad_len; - } - /* Don't leak uninitialized memory in the unused last byte. 
*/ - *dst = 0; + ext4_fc_memzero(sb, dst + EXT4_FC_TAG_BASE_LEN, remaining, crc); ext4_fc_submit_bh(sb, false); @@ -758,7 +758,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) if (ret) return NULL; sbi->s_fc_bh = bh; - sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; + sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } @@ -789,7 +789,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); - tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); + tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); @@ -2056,7 +2056,7 @@ static int ext4_fc_replay_scan(journal_t *journal, state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; @@ -2077,7 +2077,7 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; @@ -2195,9 +2195,9 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, #endif start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; - for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; -- cgit From 8805dbcb3e83a4e5a6c91edc15643a7498e576ce Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Nov 2022 14:48:41 -0800 Subject: ext4: simplify fast-commit CRC calculation Instead of checksumming each field as it is added to the block, just checksum each block before it is written. This is simpler, and also much more efficient. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221106224841.279231-8-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 54 +++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7ed71c652f67..1da85f626116 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -675,27 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) /* Ext4 commit path routines */ -/* memcpy to fc reserved space and update CRC */ -static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, - int len, u32 *crc) -{ - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); - return memcpy(dst, src, len); -} - -/* memzero and update CRC */ -static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, - u32 *crc) -{ - void *ret; - - ret = memset(dst, 0, len); - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); - return ret; -} - /* * Allocate len bytes on a fast commit buffer. 
* @@ -749,8 +728,9 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); - ext4_fc_memzero(sb, dst + EXT4_FC_TAG_BASE_LEN, remaining, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); + *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -792,13 +772,15 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); - ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); + memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); + crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); - ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ @@ -824,8 +806,8 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); - ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } @@ -847,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; - ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); + memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); + memcpy(dst, fc_dentry->fcd_name.name, dlen); return true; } @@ -889,15 +871,11 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) if (!dst) goto err; - if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc)) - goto err; + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; - if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) - goto err; + memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); - if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), - inode_len, crc)) - goto err; + memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); -- cgit From fae381a3d79bb94aa2eb752170d47458d778b797 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 7 Nov 2022 09:53:35 +0800 Subject: ext4: init quota for 'old.inode' in 'ext4_rename' Syzbot found the following issue: ext4_parse_param: s_want_extra_isize=128 ext4_inode_info_init: s_want_extra_isize=32 ext4_rename: old.inode=ffff88823869a2c8 old.dir=ffff888238699828 new.inode=ffff88823869d7e8 new.dir=ffff888238699828 __ext4_mark_inode_dirty: inode=ffff888238699828 ea_isize=32 want_ea_size=128 __ext4_mark_inode_dirty: inode=ffff88823869a2c8 ea_isize=32 want_ea_size=128 ext4_xattr_block_set: 
inode=ffff88823869a2c8 ------------[ cut here ]------------ WARNING: CPU: 13 PID: 2234 at fs/ext4/xattr.c:2070 ext4_xattr_block_set.cold+0x22/0x980 Modules linked in: RIP: 0010:ext4_xattr_block_set.cold+0x22/0x980 RSP: 0018:ffff888227d3f3b0 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff88823007a000 RCX: 0000000000000000 RDX: 0000000000000a03 RSI: 0000000000000040 RDI: ffff888230078178 RBP: 0000000000000000 R08: 000000000000002c R09: ffffed1075c7df8e R10: ffff8883ae3efc6b R11: ffffed1075c7df8d R12: 0000000000000000 R13: ffff88823869a2c8 R14: ffff8881012e0460 R15: dffffc0000000000 FS: 00007f350ac1f740(0000) GS:ffff8883ae200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f350a6ed6a0 CR3: 0000000237456000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? ext4_xattr_set_entry+0x3b7/0x2320 ? ext4_xattr_block_set+0x0/0x2020 ? ext4_xattr_set_entry+0x0/0x2320 ? ext4_xattr_check_entries+0x77/0x310 ? ext4_xattr_ibody_set+0x23b/0x340 ext4_xattr_move_to_block+0x594/0x720 ext4_expand_extra_isize_ea+0x59a/0x10f0 __ext4_expand_extra_isize+0x278/0x3f0 __ext4_mark_inode_dirty.cold+0x347/0x410 ext4_rename+0xed3/0x174f vfs_rename+0x13a7/0x2510 do_renameat2+0x55d/0x920 __x64_sys_rename+0x7d/0xb0 do_syscall_64+0x3b/0xa0 entry_SYSCALL_64_after_hwframe+0x72/0xdc Since 'ext4_rename' modifies the ctime of 'old.inode' and marks the inode dirty, it may trigger expansion of 'extra_isize' and allocate a block. If the quota for the inode has not been initialized, this leads to the warning above. To solve this issue, initialize the quota for 'old.inode' first in 'ext4_rename'. Reported-by: syzbot+98346927678ac3059c77@syzkaller.appspotmail.com Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221107015335.2524319-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/namei.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a789ea9b61a0..1c5518a4bdf9 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3796,6 +3796,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, return -EXDEV; retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(old.inode); if (retval) return retval; retval = dquot_initialize(new.dir); -- cgit From bb0fbc782ee9fa8c970c8e8e17c1a52ff9419182 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 8 Nov 2022 15:50:42 +0100 Subject: ext4: print file system UUID on mount, remount and unmount The device names are not necessarily consistent across reboots which can make it more difficult to identify the right file system when tracking down issues using system logs. Print file system UUID string on every mount, remount and unmount to make this task easier. This is similar to the functionality recently proposed for XFS. Signed-off-by: Lukas Czerner Cc: Lukas Herbolt Reviewed-by: Darrick J.
Wong Link: https://lore.kernel.org/r/20221108145042.85770-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9185131e01e5..f5e6919ea650 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1206,7 +1206,8 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) - ext4_msg(sb, KERN_INFO, "unmounting filesystem."); + ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", + &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -5655,8 +5656,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Quota mode: %s.", descr, ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. " + "Quota mode: %s.", &sb->s_uuid, descr, + ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); @@ -6611,8 +6613,8 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.", + &sb->s_uuid, ext4_quota_mode(sb)); return 0; } -- cgit From 89481b5fa8c0640e62ba84c6020cee895f7ac643 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 9 Nov 2022 15:43:43 +0800 Subject: ext4: correct inconsistent error msg in nojournal mode When we used the journal_async_commit mount option in nojournal mode, the kernel reported "can't mount with journal_checksum", which was very confusing. I found that when we mount with journal_async_commit, both the JOURNAL_ASYNC_COMMIT and EXPLICIT_JOURNAL_CHECKSUM flags are set. However, in the error branch, CHECKSUM is checked before ASYNC_COMMIT. As a result, the above inconsistency occurs, and the ASYNC_COMMIT branch becomes dead code that can never be executed. Therefore, we swap the order of the two checks so that the error message is accurate.
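The control-flow rule generalizes: when setting option A also sets option B, the error path must test A before B, or the A branch is unreachable and the message blames the wrong option. A small standalone sketch (invented flag names, not the kernel's mount-option bits):

#include <stdio.h>

#define OPT_JOURNAL_CHECKSUM	(1u << 0)
#define OPT_JOURNAL_ASYNC	(1u << 1)	/* implies OPT_JOURNAL_CHECKSUM */

static const char *nojournal_error(unsigned int opts)
{
	/* test the implying (more specific) option first; otherwise the
	 * branch below it is dead code and the message blames the wrong
	 * option */
	if (opts & OPT_JOURNAL_ASYNC)
		return "can't mount with journal_async_commit, fs mounted w/o journal";
	if (opts & OPT_JOURNAL_CHECKSUM)
		return "can't mount with journal_checksum, fs mounted w/o journal";
	return NULL;
}

int main(void)
{
	/* journal_async_commit sets both flags */
	unsigned int opts = OPT_JOURNAL_ASYNC | OPT_JOURNAL_CHECKSUM;

	printf("%s\n", nojournal_error(opts));
	return 0;
}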
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221109074343.4184862-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f5e6919ea650..878be47faaaf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5288,14 +5288,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); + "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { -- cgit From 060f77392cab1c1cefb8a27cf6bd61f1db2441ec Mon Sep 17 00:00:00 2001 From: JunChao Sun Date: Wed, 9 Nov 2022 07:38:22 -0800 Subject: ext4: replace kmem_cache_create with KMEM_CACHE Replace kmem_cache_create with the KMEM_CACHE macro, which guarantees struct alignment. Signed-off-by: JunChao Sun Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221109153822.80250-1-sunjunchao2870@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 8 ++------ fs/ext4/readpage.c | 5 ++--- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index cd0a861853e3..97eccc0028a1 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -155,9 +155,7 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, int __init ext4_init_es(void) { - ext4_es_cachep = kmem_cache_create("ext4_extent_status", - sizeof(struct extent_status), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; @@ -1807,9 +1805,7 @@ static void ext4_print_pending_tree(struct inode *inode) int __init ext4_init_pending(void) { - ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", - sizeof(struct pending_reservation), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); if (ext4_pending_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3d21eae267fc..773176e7f9f5 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -410,9 +410,8 @@ int ext4_mpage_readpages(struct inode *inode, int __init ext4_init_post_read_processing(void) { - bio_post_read_ctx_cache = - kmem_cache_create("ext4_bio_post_read_ctx", - sizeof(struct bio_post_read_ctx), 0, 0, NULL); + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); + if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = -- cgit From 26d75a16af285a70863ba6a81f85d81e7e65da50 Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Wed, 9 Nov 2022 18:14:45 +0000 Subject: ext4: fix error code return to user-space in ext4_get_branch() If a block is out of range in
ext4_get_branch(), -ENOMEM will be returned to user-space. Obviously, this error code isn't really useful. This patch fixes it by making sure the right error code (-EFSCORRUPTED) is propagated to user-space. EUCLEAN is more informative than ENOMEM. Signed-off-by: Luís Henriques Link: https://lore.kernel.org/r/20221109181445.17843-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/indirect.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 860fc5119009..c68bebe7ff4b 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + unsigned int key; int ret = -EIO; *err = 0; @@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); + key = le32_to_cpu(p->key); + if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { + /* the block was out of range */ + ret = -EFSCORRUPTED; + goto failure; + } + bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; -- cgit From b76abb5157468756163fe7e3431c9fe32cba57ca Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 10 Nov 2022 12:16:29 -0800 Subject: ext4: don't return EINVAL from GETFSUUID when reporting UUID length If userspace calls this ioctl with fsu_length (the length of the fsuuid.fsu_uuid array) set to zero, ext4 copies the desired uuid length out to userspace. The kernel call returned a result from a valid input, so the return value here should be zero, not EINVAL. While we're at it, fix the copy_to_user call to make it clear that we're only copying out fsu_len. Signed-off-by: Darrick J. Wong Reviewed-by: Catherine Hoang Link: https://lore.kernel.org/r/166811138914.327006.9241306894437166566.stgit@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e5f60057db5b..beedaebab21c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1154,9 +1154,10 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, if (fsuuid.fsu_len == 0) { fsuuid.fsu_len = UUID_SIZE; - if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len))) + if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, + sizeof(fsuuid.fsu_len))) return -EFAULT; - return -EINVAL; + return 0; } if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; -- cgit From a7e9d977e031fceefe1e7cd69ebd7202d5758b56 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 10 Nov 2022 12:16:34 -0800 Subject: ext4: don't fail GETFSUUID when the caller provides a long buffer If userspace provides a longer UUID buffer than is required, we shouldn't fail the call with EINVAL -- rather, we can fill the caller's buffer with the bytes we /can/ fill, and update the length field to reflect what we copied. This doesn't break the UAPI since we're enabling a case that currently fails, and so far Ted hasn't released a version of e2fsprogs that uses the new ext4 ioctl. Signed-off-by: Darrick J.
Wong Reviewed-by: Catherine Hoang Link: https://lore.kernel.org/r/166811139478.327006.13879198441587445544.stgit@magnolia Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index beedaebab21c..202953b5db49 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1160,14 +1160,16 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, return 0; } - if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; lock_buffer(sbi->s_sbh); memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); unlock_buffer(sbi->s_sbh); - if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || + copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) return -EFAULT; return 0; } -- cgit From a408f33e895e455f16cf964cb5cd4979b658db7b Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 17 Nov 2022 12:03:39 +0800 Subject: ext4: fix bad checksum after online resize When online resizing is performed twice consecutively, the error message "Superblock checksum does not match superblock" is displayed for the second time. Here's the reproducer: mkfs.ext4 -F /dev/sdb 100M mount /dev/sdb /tmp/test resize2fs /dev/sdb 5G resize2fs /dev/sdb 6G To solve this issue, we moved the update of the checksum to after es->s_overhead_clusters is updated. Fixes: 026d0d27c488 ("ext4: reduce computation of overhead during resize") Fixes: de394a86658f ("ext4: update s_overhead_clusters in the superblock during an on-line resize") Signed-off-by: Baokun Li Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6e88a85abd7f..70fef50ab3f9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1476,8 +1476,6 @@ static void ext4_update_super(struct super_block *sb, * active. */ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + reserved_blocks); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, @@ -1513,6 +1511,8 @@ static void ext4_update_super(struct super_block *sb, ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, -- cgit From 8f49ec603ae3e213bfab2799182724e3abac55a1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 17 Nov 2022 12:03:40 +0800 Subject: ext4: fix corrupt backup group descriptors after online resize In commit 9a8c5b0d0615 ("ext4: update the backup superblock's at the end of the online resize"), it is assumed that update_backups() only updates backup superblocks, so each b_data is treated as a backup superblock to update its s_block_group_nr and s_checksum. However, update_backups() also updates the backup group descriptors, which causes the backup group descriptors to be corrupted. The above commit fixes the problem of invalid checksum of the backup superblock.
The root cause of this problem is that the checksum of ext4_update_super() is not set correctly. This problem has been fixed in the previous patch ("ext4: fix bad checksum after online resize"). However, we do need to set block_group_nr for the backup superblock in update_backups(). When a block is in a group that contains a backup superblock, and the block is the first block in the group, the block is definitely a superblock. We add a helper function that includes setting s_block_group_nr and updating checksum, and then call it only when the above conditions are met to prevent the backup group descriptors from being incorrectly modified. Fixes: 9a8c5b0d0615 ("ext4: update the backup superblock's at the end of the online resize") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 70fef50ab3f9..d460440d6dee 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1110,6 +1110,16 @@ exit_free: return err; } +static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, + ext4_group_t group) +{ + struct ext4_super_block *es = (struct ext4_super_block *) data; + + es->s_block_group_nr = cpu_to_le16(group); + if (ext4_has_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(sb, es); +} + /* * Update the backup copies of the ext4 metadata. These don't need to be part * of the main resize transaction, because e2fsck will re-write them if there @@ -1158,7 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, while (group < sbi->s_groups_count) { struct buffer_head *bh; ext4_fsblk_t backup_block; - struct ext4_super_block *es; + int has_super = ext4_bg_has_super(sb, group); + ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group); /* Out of journal space, and can't get more - abort - so sad */ err = ext4_resize_ensure_credits_batch(handle, 1); @@ -1168,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, if (meta_bg == 0) backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; else - backup_block = (ext4_group_first_block_no(sb, group) + - ext4_bg_has_super(sb, group)); + backup_block = first_block + has_super; bh = sb_getblk(sb, backup_block); if (unlikely(!bh)) { @@ -1187,10 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); - es = (struct ext4_super_block *) bh->b_data; - es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (has_super && (backup_block == first_block)) + ext4_set_block_group_nr(sb, bh->b_data, group); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); -- cgit From 0aeaa2559d6d53358fca3e3fce73807367adca74 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 17 Nov 2022 12:03:41 +0800 Subject: ext4: fix corruption when online resizing a 1K bigalloc fs When a backup superblock is updated in update_backups(), the primary superblock's offset in the group (that is, sbi->s_sbh->b_blocknr) is used as the backup superblock's offset in its group. However, when the block size is 1K and bigalloc is enabled, the two offsets are not equal. 
This causes the backup group descriptors to be overwritten by the superblock in update_backups(). Moreover, if meta_bg is enabled, the file system will be corrupted because this feature uses backup group descriptors. To solve this issue, we use a more accurate ext4_group_first_block_no() as the offset of the backup superblock in its group. Fixes: d77147ff443b ("ext4: add support for online resizing with bigalloc") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221117040341.1380702-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d460440d6dee..6b91443d6bf3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1604,8 +1604,8 @@ exit_journal: int meta_bg = ext4_has_feature_meta_bg(sb); sector_t old_gdb = 0; - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block), 0); + update_backups(sb, ext4_group_first_block_no(sb, 0), + (char *)es, sizeof(struct ext4_super_block), 0); for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; @@ -1816,7 +1816,7 @@ errout: if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, + update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); } return err; -- cgit From 7ea71af94eaaaf6d9aed24bc94a05b977a741cb9 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 17 Nov 2022 15:36:03 +0800 Subject: ext4: fix uninitialized value in 'ext4_evict_inode' Syzbot found the following issue: ===================================================== BUG: KMSAN: uninit-value in ext4_evict_inode+0xdd/0x26b0 fs/ext4/inode.c:180 ext4_evict_inode+0xdd/0x26b0 fs/ext4/inode.c:180 evict+0x365/0x9a0 fs/inode.c:664 iput_final fs/inode.c:1747 [inline] iput+0x985/0xdd0 fs/inode.c:1773 __ext4_new_inode+0xe54/0x7ec0 fs/ext4/ialloc.c:1361 ext4_mknod+0x376/0x840 fs/ext4/namei.c:2844 vfs_mknod+0x79d/0x830 fs/namei.c:3914 do_mknodat+0x47d/0xaa0 __do_sys_mknodat fs/namei.c:3992 [inline] __se_sys_mknodat fs/namei.c:3989 [inline] __ia32_sys_mknodat+0xeb/0x150 fs/namei.c:3989 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1b/0x20 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 Uninit was created at: __alloc_pages+0x9f1/0xe80 mm/page_alloc.c:5578 alloc_pages+0xaae/0xd80 mm/mempolicy.c:2285 alloc_slab_page mm/slub.c:1794 [inline] allocate_slab+0x1b5/0x1010 mm/slub.c:1939 new_slab mm/slub.c:1992 [inline] ___slab_alloc+0x10c3/0x2d60 mm/slub.c:3180 __slab_alloc mm/slub.c:3279 [inline] slab_alloc_node mm/slub.c:3364 [inline] slab_alloc mm/slub.c:3406 [inline] __kmem_cache_alloc_lru mm/slub.c:3413 [inline] kmem_cache_alloc_lru+0x6f3/0xb30 mm/slub.c:3429 alloc_inode_sb include/linux/fs.h:3117 [inline] ext4_alloc_inode+0x5f/0x860 fs/ext4/super.c:1321 alloc_inode+0x83/0x440 fs/inode.c:259 new_inode_pseudo fs/inode.c:1018 [inline] new_inode+0x3b/0x430 fs/inode.c:1046 __ext4_new_inode+0x2a7/0x7ec0 fs/ext4/ialloc.c:959 ext4_mkdir+0x4d5/0x1560 fs/ext4/namei.c:2992 vfs_mkdir+0x62a/0x870 fs/namei.c:4035 do_mkdirat+0x466/0x7b0 fs/namei.c:4060 __do_sys_mkdirat fs/namei.c:4075 [inline] __se_sys_mkdirat fs/namei.c:4073 [inline]
__ia32_sys_mkdirat+0xc4/0x120 fs/namei.c:4073 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1b/0x20 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 CPU: 1 PID: 4625 Comm: syz-executor.2 Not tainted 6.1.0-rc4-syzkaller-62821-gcb231e2f67ec #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 ===================================================== Currently, 'ext4_alloc_inode()' does not initialize 'ei->i_flags'. If creating a new inode fails in '__ext4_new_inode()' before 'ei->i_flags' has been set, the subsequent 'iput()' ends up in 'ext4_evict_inode()', which since commit 6bc0d63dad7f reads 'ei->i_flags' and therefore accesses an uninitialized value. To solve this issue, initialize 'ei->i_flags' in 'ext4_alloc_inode()'. Reported-by: syzbot+57b25da729eb0b88177d@syzkaller.appspotmail.com Signed-off-by: Ye Bin Fixes: 6bc0d63dad7f ("ext4: remove EA inode entry from mbcache on inode eviction") Reviewed-by: Jan Kara Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221117073603.2598882-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 878be47faaaf..28d009151d23 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1324,6 +1324,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; inode_set_iversion(&ei->vfs_inode, 1); + ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); atomic_set(&ei->i_prealloc_active, 0); -- cgit From 131294c35ed6f777bd4e79d42af13b5c41bf2775 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 17 Nov 2022 10:22:07 -0500 Subject: ext4: fix delayed allocation bug in ext4_clu_mapped for bigalloc + inline When converting files with inline data to extents, delayed allocations made on a file system created with both the bigalloc and inline options can result in invalid extent status cache content, incorrect reserved cluster counts, kernel memory leaks, and potential kernel panics. With bigalloc, the code that determines whether a block must be delayed allocated searches the extent tree to see if that block maps to a previously allocated cluster. If not, the block is delayed allocated, and otherwise, it isn't. However, if the inline option is also used, and if the file containing the block is marked as able to store data inline, there isn't a valid extent tree associated with the file. The current code in ext4_clu_mapped() calls ext4_find_extent() to search the non-existent tree for a previously allocated cluster anyway, which typically finds nothing, as desired. However, a side effect of the search can be to cache invalid content from the non-existent tree (garbage) in the extent status tree, including bogus entries in the pending reservation tree. To fix this, avoid searching the extent tree when allocating blocks for bigalloc + inline files that are being converted from inline to extent mapped.
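For context, the cluster lookup that the new early return skips is driven by a power-of-two mapping from logical blocks to clusters. A standalone sketch of the arithmetic and the guard (the cluster ratio is an assumption; names are illustrative, not the ext4_clu_mapped() code):

#include <stdio.h>

/* bigalloc: a cluster is 2^cluster_bits blocks, so EXT4_B2C() is a shift */
static unsigned long block_to_cluster(unsigned long lblk, int cluster_bits)
{
	return lblk >> cluster_bits;
}

int main(void)
{
	int cluster_bits = 4;		/* assumed: 16 blocks per cluster */
	unsigned long lblk = 37;
	int may_inline = 1;		/* EXT4_STATE_MAY_INLINE_DATA set */

	if (may_inline) {
		/* no extent tree exists: report the cluster unmapped
		 * without walking (and polluting) any cache */
		printf("cluster %lu: unmapped (inline file)\n",
		       block_to_cluster(lblk, cluster_bits));
		return 0;
	}
	printf("cluster %lu: search the extent tree\n",
	       block_to_cluster(lblk, cluster_bits));
	return 0;
}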
Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20221117152207.2424-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ec008278d970..9de1c9d1a13d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5797,6 +5797,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; + /* + * if data can be stored inline, the logical cluster isn't + * mapped - no physical clusters have been allocated, and the + * file has no extents + */ + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return 0; + /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) { -- cgit From 956510c0c7439e90b8103aaeaf4da92878c622f0 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 21 Nov 2022 12:21:30 +0100 Subject: fs: ext4: initialize fsdata in pagecache_write() When aops->write_begin() does not initialize fsdata, KMSAN reports an error passing the latter to aops->write_end(). Fix this by unconditionally initializing fsdata. Cc: Eric Biggers Fixes: c93d8f885809 ("ext4: add basic fs-verity support") Reported-by: syzbot+9767be679ef5016b6082@syzkaller.appspotmail.com Signed-off-by: Alexander Potapenko Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221121112134.407362-1-glider@google.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/verity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 3c640bd7ecae..30e3b65798b5 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *fsdata; + void *fsdata = NULL; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); -- cgit From b40ebaf63851b3a401b0dc9263843538f64f5ce6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Nov 2022 14:09:29 +0100 Subject: ext4: avoid BUG_ON when creating xattrs Commit fb0a387dcdcd ("ext4: limit block allocations for indirect-block files to < 2^32") added code to try to allocate xattr block with 32-bit block number for indirect block based files on the grounds that these files cannot use larger block numbers. It also added BUG_ON when allocated block could not fit into 32 bits. This is however bogus reasoning because xattr block is stored in inode->i_file_acl and inode->i_file_acl_hi and as such even indirect block based files can happily use full 48 bits for xattr block number. The proper handling seems to be there basically since 64-bit block number support was added. So remove the bogus limitation and BUG_ON. 
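The 48-bit capacity is plain field arithmetic: the low 32 bits of the xattr block number live in i_file_acl and the high 16 bits in i_file_acl_hi. A standalone sketch of the split and recombination (the block number is an arbitrary example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t block = 0x123456789aULL;	/* above 2^32, below 2^48 */

	/* split across the two on-disk fields... */
	uint32_t i_file_acl    = (uint32_t)block;		/* low 32 bits */
	uint16_t i_file_acl_hi = (uint16_t)(block >> 32);	/* high 16 bits */

	/* ...and recombine: all 48 bits survive, extents or not */
	uint64_t rebuilt = ((uint64_t)i_file_acl_hi << 32) | i_file_acl;

	printf("block=%#llx rebuilt=%#llx\n",
	       (unsigned long long)block, (unsigned long long)rebuilt);
	return 0;
}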
Cc: Eric Sandeen Fixes: fb0a387dcdcd ("ext4: limit block allocations for indirect-block files to < 2^32") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221121130929.32031-1-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 718ef3987f94..4d1c701f0eec 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2071,19 +2071,11 @@ inserted: goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %llu", (unsigned long long)block); -- cgit From a44e84a9b7764c72896f7241a0ec9ac7e7ef38dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Nov 2022 20:39:50 +0100 Subject: ext4: fix deadlock due to mbcache entry corruption When manipulating xattr blocks, we can deadlock infinitely looping inside ext4_xattr_block_set() where we constantly keep finding xattr block for reuse in mbcache but we are unable to reuse it because its reference count is too big. This happens because cache entry for the xattr block is marked as reusable (e_reusable set) although its reference count is too big. When this inconsistency happens, this inconsistent state is kept indefinitely and so ext4_xattr_block_set() keeps retrying indefinitely. The inconsistent state is caused by non-atomic update of e_reusable bit. e_reusable is part of a bitfield and e_reusable update can race with update of e_referenced bit in the same bitfield resulting in loss of one of the updates. Fix the problem by using atomic bitops instead. This bug has been around for many years, but it became *much* easier to hit after commit 65f8b80053a1 ("ext4: fix race when reusing xattr blocks"). 
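The race class deserves a sketch: adjacent C bitfields share one machine word, so two plain read-modify-write updates can silently lose one another, while per-bit atomic operations cannot. A userspace analogue of the fix, using C11 atomics in place of the kernel's set_bit()/clear_bit()/test_bit() (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

#define MBE_REFERENCED_B	0
#define MBE_REUSABLE_B		1

static atomic_ulong e_flags;

/* atomic read-modify-write: concurrent updates of different bits in the
 * same word cannot be lost, unlike plain stores to adjacent bitfields */
static void set_bit_atomic(int nr)	{ atomic_fetch_or(&e_flags, 1UL << nr); }
static void clear_bit_atomic(int nr)	{ atomic_fetch_and(&e_flags, ~(1UL << nr)); }
static int test_bit_atomic(int nr)	{ return (atomic_load(&e_flags) >> nr) & 1; }

int main(void)
{
	set_bit_atomic(MBE_REUSABLE_B);
	set_bit_atomic(MBE_REFERENCED_B);	/* does not clobber REUSABLE */
	clear_bit_atomic(MBE_REFERENCED_B);
	printf("reusable=%d referenced=%d\n",
	       test_bit_atomic(MBE_REUSABLE_B), test_bit_atomic(MBE_REFERENCED_B));
	return 0;
}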
Cc: stable@vger.kernel.org Fixes: 6048c64b2609 ("mbcache: add reusable flag to cache entries") Fixes: 65f8b80053a1 ("ext4: fix race when reusing xattr blocks") Reported-and-tested-by: Jeremi Piotrowski Reported-by: Thilo Fromm Link: https://lore.kernel.org/r/c77bf00f-4618-7149-56f1-b8d1664b9d07@linux.microsoft.com/ Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20221123193950.16758-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 4 ++-- fs/mbcache.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 4d1c701f0eec..6bdd502527f8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1281,7 +1281,7 @@ retry_ref: ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { - ce->e_reusable = 1; + set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } @@ -2043,7 +2043,7 @@ inserted: } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) - ce->e_reusable = 0; + clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); diff --git a/fs/mbcache.c b/fs/mbcache.c index e272ad738faf..2a4b8b549e93 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -100,8 +100,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; - entry->e_reusable = reusable; - entry->e_referenced = 0; + entry->e_flags = 0; + if (reusable) + set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { @@ -165,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); - if (entry->e_key == key && entry->e_reusable && + if (entry->e_key == key && + test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; @@ -284,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { - entry->e_referenced = 1; + set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); @@ -309,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ - if (entry->e_referenced || + if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { - entry->e_referenced = 0; + clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } -- cgit From d73eff68a8c0ea3fc626b08ea84e4cd327ccbe5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 2 Dec 2022 20:04:09 +0800 Subject: ext4: make ext4_mb_initialize_context return void Change the return type to void since it always returns 0, so there is no need to do the checking in ext4_mb_new_blocks.
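In miniature, the refactor looks like this (invented names, not the mballoc code): once a function's only possible return value is 0, the int return type and the caller's error check are both dead weight.

#include <stdio.h>

/* before: static int init_context(...) { ...; return 0; }
 * every caller had to check a value that could never be nonzero */
static void init_context(int *ctx, int arg)
{
	*ctx = arg;	/* cannot fail; nothing to report */
}

int main(void)
{
	int ctx;

	init_context(&ctx, 42);		/* call site: no dead error branch */
	printf("ctx=%d\n", ctx);
	return 0;
}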
Signed-off-by: Guoqing Jiang Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20221202120409.24098-1-guoqing.jiang@linux.dev Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9dad93059945..5b2ae37a8b80 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5204,7 +5204,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { @@ -5253,8 +5253,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? "" : "non-"); - return 0; - } static noinline_for_stack void @@ -5591,11 +5589,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out; } - *errp = ext4_mb_initialize_context(ac, ar); - if (*errp) { - ar->len = 0; - goto out; - } + ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); -- cgit From 5c099c4fdc438014d5893629e70a8ba934433ee8 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 6 Dec 2022 22:41:34 +0800 Subject: ext4: fix kernel BUG in 'ext4_write_inline_data_end()' Syzbot reported the following issue: ------------[ cut here ]------------ kernel BUG at fs/ext4/inline.c:227! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3629 Comm: syz-executor212 Not tainted 6.1.0-rc5-syzkaller-00018-g59d0d52c30d4 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 RIP: 0010:ext4_write_inline_data+0x344/0x3e0 fs/ext4/inline.c:227 RSP: 0018:ffffc90003b3f368 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8880704e16c0 RCX: 0000000000000000 RDX: ffff888021763a80 RSI: ffffffff821e31a4 RDI: 0000000000000006 RBP: 000000000006818e R08: 0000000000000006 R09: 0000000000068199 R10: 0000000000000079 R11: 0000000000000000 R12: 000000000000000b R13: 0000000000068199 R14: ffffc90003b3f408 R15: ffff8880704e1c82 FS: 000055555723e3c0(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fffe8ac9080 CR3: 0000000079f81000 CR4: 0000000000350ee0 Call Trace: ext4_write_inline_data_end+0x2a3/0x12f0 fs/ext4/inline.c:768 ext4_write_end+0x242/0xdd0 fs/ext4/inode.c:1313 ext4_da_write_end+0x3ed/0xa30 fs/ext4/inode.c:3063 generic_perform_write+0x316/0x570 mm/filemap.c:3764 ext4_buffered_write_iter+0x15b/0x460 fs/ext4/file.c:285 ext4_file_write_iter+0x8bc/0x16e0 fs/ext4/file.c:700 call_write_iter include/linux/fs.h:2191 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x1d0/0x210 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ---[ end trace 0000000000000000
]--- The above issue may happen as follows: ext4_da_write_begin ext4_da_write_inline_data_begin ext4_da_convert_inline_data_to_extent ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ext4_da_write_end ext4_run_li_request ext4_mb_prefetch ext4_read_block_bitmap_nowait ext4_validate_block_bitmap ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT) percpu_counter_sub(&sbi->s_freeclusters_counter,grp->bb_free); -> sbi->s_freeclusters_counter becomes zero ext4_da_write_begin if (ext4_nonda_switch(inode->i_sb)) -> As freeclusters_counter is zero will return true *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; ext4_write_begin ext4_da_write_end if (write_mode == FALL_BACK_TO_NONDELALLOC) ext4_write_end if (inline_data) ext4_write_inline_data_end ext4_write_inline_data BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); -> As inode is already convert to extent, so 'pos + len' > inline_size -> then trigger BUG. To solve this issue, instead of checking ext4_has_inline_data() which is only cleared after data has been written back, check the EXT4_STATE_MAY_INLINE_DATA flag in ext4_write_end(). Fixes: f19d5870cbf7 ("ext4: add normal write support for inline data") Reported-by: syzbot+4faa160fa96bfba639f8@syzkaller.appspotmail.com Reported-by: Jun Nie Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20221206144134.1919987-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 181bc161b1ac..a0f4d4197a0b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1315,7 +1315,8 @@ static int ext4_write_end(struct file *file, trace_ext4_write_end(inode, pos, len, copied); - if (ext4_has_inline_data(inode)) + if (ext4_has_inline_data(inode) && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, page); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); -- cgit From 04e568a3b31cfbd545c04c8bfc35c20e5ccfce0f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:04 +0100 Subject: ext4: handle redirtying in ext4_bio_write_page() Since we want to transition transaction commits to use ext4_writepages() for writing back ordered data, add handling of page redirtying into ext4_bio_write_page(). Also move buffer dirty bit clearing into the same place as other buffer state handling. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 97fa7b4c645f..4e68ace86f11 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -482,6 +482,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* A hole? We can safely clear the dirty bit */ if (!buffer_mapped(bh)) clear_buffer_dirty(bh); + /* + * Keeping dirty some buffer we cannot write? Make + * sure to redirty the page. This happens e.g. when + * doing writeout for transaction commit.
+ */ + if (buffer_dirty(bh) && !PageDirty(page)) + redirty_page_for_writepage(wbc, page); if (io->io_bio) ext4_io_submit(io); continue; @@ -489,6 +496,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (buffer_new(bh)) clear_buffer_new(bh); set_buffer_async_write(bh); + clear_buffer_dirty(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); @@ -532,7 +540,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); redirty_page_for_writepage(wbc, page); do { - clear_buffer_async_write(bh); + if (buffer_async_write(bh)) { + clear_buffer_async_write(bh); + set_buffer_dirty(bh); + } bh = bh->b_this_page; } while (bh != head); goto unlock; @@ -546,7 +557,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, io_submit_add_bh(io, inode, bounce_page ? bounce_page : page, bh); nr_submitted++; - clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); unlock: -- cgit From dff4ac75eeeefc4397fe7cf8ce559425bf46b1f7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:05 +0100 Subject: ext4: move keep_towrite handling to ext4_bio_write_page() When we are writing back page but we cannot for some reason write all its buffers (e.g. because we cannot allocate blocks in current context) we have to keep TOWRITE tag set in the mapping as otherwise racing WB_SYNC_ALL writeback that could write these buffers can skip the page and result in data loss. We will need this logic for writeback during transaction commit so move the logic from ext4_writepage() to ext4_bio_write_page(). Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +-- fs/ext4/inode.c | 6 ++---- fs/ext4/page-io.c | 36 +++++++++++++++++++++--------------- 3 files changed, 24 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4e739902dc03..0540929630a1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3757,8 +3757,7 @@ extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite); + int len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a0f4d4197a0b..4c70cc525f10 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2016,7 +2016,6 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; - bool keep_towrite = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { folio_invalidate(folio, 0, folio_size(folio)); @@ -2074,7 +2073,6 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } - keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -2091,7 +2089,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); + ret = ext4_bio_write_page(&io_submit, page, len); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -2125,7 +2123,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) len = size & ~PAGE_MASK; else 
len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len, false); + err = ext4_bio_write_page(&mpd->io_submit, page, len); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4e68ace86f11..4f9ecacd10aa 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -430,8 +430,7 @@ submit_and_retry: int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite) + int len) { struct page *bounce_page = NULL; struct inode *inode = page->mapping->host; @@ -441,14 +440,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, int nr_submitted = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; + bool keep_towrite = false; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); ClearPageError(page); /* @@ -483,12 +479,17 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (!buffer_mapped(bh)) clear_buffer_dirty(bh); /* - * Keeping dirty some buffer we cannot write? Make - * sure to redirty the page. This happens e.g. when - * doing writeout for transaction commit. + * Keeping dirty some buffer we cannot write? Make sure + * to redirty the page and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the page. + * This happens e.g. when doing writeout for + * transaction commit. */ - if (buffer_dirty(bh) && !PageDirty(page)) - redirty_page_for_writepage(wbc, page); + if (buffer_dirty(bh)) { + if (!PageDirty(page)) + redirty_page_for_writepage(wbc, page); + keep_towrite = true; + } if (io->io_bio) ext4_io_submit(io); continue; @@ -500,6 +501,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, nr_to_submit++; } while ((bh = bh->b_this_page) != head); + /* Nothing to submit? Just unlock the page... */ + if (!nr_to_submit) + goto unlock; + bh = head = page_buffers(page); /* @@ -550,6 +555,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } } + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) @@ -558,11 +568,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, bounce_page ? bounce_page : page, bh); nr_submitted++; } while ((bh = bh->b_this_page) != head); - unlock: unlock_page(page); - /* Nothing submitted - we have to end page writeback */ - if (!nr_submitted) - end_page_writeback(page); return ret; } -- cgit From 29b83c574b0a06f29126e4e18801ba0e5d2e089b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:06 +0100 Subject: ext4: remove nr_submitted from ext4_bio_write_page() nr_submitted is the same as nr_to_submit. Drop one of them. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4f9ecacd10aa..2bdfb8a046d9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -437,7 +437,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, unsigned block_start; struct buffer_head *bh, *head; int ret = 0; - int nr_submitted = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; bool keep_towrite = false; @@ -566,7 +565,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, continue; io_submit_add_bh(io, inode, bounce_page ? 
bounce_page : page, bh); - nr_submitted++; } while ((bh = bh->b_this_page) != head); unlock: unlock_page(page); -- cgit From 5c27088b3bcf82c8525ec55cde7d2ddd034283b6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:07 +0100 Subject: ext4: drop pointless IO submission from ext4_bio_write_page() We submit outstanding IO in ext4_bio_write_page() if we find a buffer we are not going to write. This is however pointless because we already handle submission of previous IO in case we detect newly added buffer head is discontiguous. So just delete the pointless IO submission call. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2bdfb8a046d9..beaec6d81074 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -489,8 +489,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, redirty_page_for_writepage(wbc, page); keep_towrite = true; } - if (io->io_bio) - ext4_io_submit(io); continue; } if (buffer_new(bh)) -- cgit From de0039f69c95692539bcb7cfc3b01f31e1c9f342 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:08 +0100 Subject: ext4: add support for writepages calls that cannot map blocks Add support for calls to ext4_writepages() that cannot map blocks. These will be issued from jbd2 transaction commit code. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4c70cc525f10..186635cb1b59 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1564,6 +1564,7 @@ struct mpage_da_data { struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; + unsigned int can_map:1; /* Can writepages call map blocks? */ unsigned int scanned_until_end:1; }; @@ -2556,18 +2557,33 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } +/* Return true if the page needs to be written as part of transaction commit */ +static bool ext4_page_nomap_can_writeout(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) + return true; + } while ((bh = bh->b_this_page) != head); + return false; +} + /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages - * and underlying extent to map + * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for - * IO immediately. When we find a page which isn't mapped we start accumulating - * extent of buffers underlying these pages that needs mapping (formed by - * either delayed or unwritten buffers). We also lock the pages containing - * these buffers. The extent found is returned in @mpd structure (starting at - * mpd->lblk with length mpd->len blocks). + * IO immediately. If we cannot map blocks, we submit just already mapped + * buffers in the page for IO and keep page dirty.
When we can map blocks and + * we find a page which isn't mapped we start accumulating extent of buffers + * underlying these pages that needs mapping (formed by either delayed or + * unwritten buffers). We also lock the pages containing these buffers. The + * extent found is returned in @mpd structure (starting at mpd->lblk with + * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an @@ -2658,14 +2674,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << - (PAGE_SHIFT - blkbits); - head = page_buffers(page); - err = mpage_process_page_bufs(mpd, head, head, lblk); - if (err <= 0) - goto out; - err = 0; + /* + * Writeout for transaction commit where we cannot + * modify metadata is simple. Just submit the page. + */ + if (!mpd->can_map) { + if (ext4_page_nomap_can_writeout(page)) { + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; + } else { + unlock_page(page); + mpd->first_page++; + } + } else { + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_SHIFT - blkbits); + head = page_buffers(page); + err = mpage_process_page_bufs(mpd, head, head, + lblk); + if (err <= 0) + goto out; + err = 0; + } left--; } pagevec_release(&pvec); @@ -2785,6 +2817,7 @@ retry: */ mpd.do_map = 0; mpd.scanned_until_end = 0; + mpd.can_map = 1; mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { ret = -ENOMEM; @@ -2808,6 +2841,7 @@ retry: break; } + WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when -- cgit From 15648d599cd1c15cc678039dcab65599276fe407 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:09 +0100 Subject: ext4: provide ext4_do_writepages() Provide ext4_do_writepages() function that takes mpage_da_data as an argument and make ext4_writepages() just a simple wrapper around it. No functional changes. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 96 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 186635cb1b59..a023cca4b6f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1550,9 +1550,12 @@ void ext4_da_release_space(struct inode *inode, int to_free) */ struct mpage_da_data { + /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; + unsigned int can_map:1; /* Can writepages call map blocks? */ + /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ @@ -1564,7 +1567,6 @@ struct mpage_da_data { struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; - unsigned int can_map:1; /* Can writepages call map blocks? 
*/ unsigned int scanned_until_end:1; }; @@ -2710,16 +2712,16 @@ out: return err; } -static int ext4_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_do_writepages(struct mpage_da_data *mpd) { + struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; - struct mpage_da_data mpd; - struct inode *inode = mapping->host; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; @@ -2794,19 +2796,18 @@ static int ext4_writepages(struct address_space *mapping, writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd.first_page = writeback_index; - mpd.last_page = -1; + mpd->first_page = writeback_index; + mpd->last_page = -1; } else { - mpd.first_page = wbc->range_start >> PAGE_SHIFT; - mpd.last_page = wbc->range_end >> PAGE_SHIFT; + mpd->first_page = wbc->range_start >> PAGE_SHIFT; + mpd->last_page = wbc->range_end >> PAGE_SHIFT; } - mpd.inode = inode; - mpd.wbc = wbc; - ext4_io_submit_init(&mpd.io_submit, wbc); + ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + tag_pages_for_writeback(mapping, mpd->first_page, + mpd->last_page); blk_start_plug(&plug); /* @@ -2815,28 +2816,27 @@ retry: * in the block layer on device congestion while having transaction * started. */ - mpd.do_map = 0; - mpd.scanned_until_end = 0; - mpd.can_map = 1; - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->do_map = 0; + mpd->scanned_until_end = 0; + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } - ret = mpage_prepare_extent_to_map(&mpd); + ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); + mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); - ext4_put_io_end_defer(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_io_submit(&mpd->io_submit); + ext4_put_io_end_defer(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; - while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { + while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } @@ -2861,16 +2861,16 @@ retry: "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; break; } - mpd.do_map = 1; + mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); - ret = mpage_prepare_extent_to_map(&mpd); - if (!ret && mpd.map.m_len) - ret = mpage_map_and_submit_extent(handle, &mpd, + trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + ret = mpage_prepare_extent_to_map(mpd); + if (!ret && mpd->map.m_len) + ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); 
/* * Caution: If the handle is synchronous, @@ -2885,12 +2885,12 @@ retry: if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; - mpd.do_map = 0; + mpd->do_map = 0; } /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, give_up_on_write); + mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); + ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have @@ -2900,11 +2900,11 @@ retry: * up doing unwritten extent conversion. */ if (handle) { - ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2924,8 +2924,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd.last_page = writeback_index - 1; - mpd.first_page = 0; + mpd->last_page = writeback_index - 1; + mpd->first_page = 0; goto retry; } @@ -2935,7 +2935,7 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd.first_page; + mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, @@ -2944,6 +2944,18 @@ out_writepages: return ret; } +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct mpage_da_data mpd = { + .inode = mapping->host, + .wbc = wbc, + .can_map = 1, + }; + + return ext4_do_writepages(&mpd); +} + static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { -- cgit From 29bc9cea0e13dc8043c249f5f232b9e0c441ae24 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:10 +0100 Subject: ext4: move percpu_rwsem protection into ext4_writepages() Move protection by percpu_rwsem from ext4_do_writepages() to ext4_writepages(). We will not want to grab this protection during transaction commits as that would be prone to deadlocks and the protection is not needed. Move the shutdown state checking as well since we want to be able to complete commit while the shutdown is in progress. 
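A minimal sketch of the resulting locking split (simplified signatures; demo_* names are hypothetical, the s_writepages_rwsem field is real):

/* Inner worker: no shutdown check and no rwsem, so the jbd2 commit
 * path can call it directly without risking deadlock. */
static int demo_do_writepages(struct mpage_da_data *mpd)
{
	/* ...actual writeout elided... */
	return 0;
}

/* VFS entry point: takes the protection on behalf of regular callers. */
static int demo_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
	struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc };
	int ret;

	percpu_down_read(&sbi->s_writepages_rwsem);
	ret = demo_do_writepages(&mpd);
	percpu_up_read(&sbi->s_writepages_rwsem);
	return ret;
}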
Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a023cca4b6f6..3aa01a9d4664 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2727,10 +2727,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) struct blk_plug plug; bool give_up_on_write = false; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; - - percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); /* @@ -2940,20 +2936,28 @@ unplug: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; + int ret; + + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return -EIO; - return ext4_do_writepages(&mpd); + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + ret = ext4_do_writepages(&mpd); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); + + return ret; } static int ext4_dax_writepages(struct address_space *mapping, -- cgit From 59205c8d4eaeeb85cf76eb1cb140283cf900412a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:11 +0100 Subject: ext4: switch to using ext4_do_writepages() for ordered data writeout Use the standard writepages method (ext4_do_writepages()) to perform writeout of ordered data during journal commit. Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-8-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 16 ++++++++++++++++ fs/ext4/super.c | 3 +-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0540929630a1..140e1eb300d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3000,6 +3000,7 @@ extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3aa01a9d4664..56f6ef52fe76 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2960,6 +2960,22 @@ static int ext4_writepages(struct address_space *mapping, return ret; } +int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + struct mpage_da_data mpd = { + .inode = jinode->i_vfs_inode, + .wbc = &wbc, + .can_map = 0, + }; + return ext4_do_writepages(&mpd); +} + static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 28d009151d23..72ead3b56706 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -540,8 +540,7 @@ static int 
ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else - ret = jbd2_journal_submit_inode_data_buffers(jinode); - + ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } -- cgit From f30ff35f6266993405c8659e48fddc3180692164 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:12 +0100 Subject: jbd2: switch jbd2_submit_inode_data() to use fs-provided hook for data writeout jbd2_submit_inode_data() hardcoded use of jbd2_journal_submit_inode_data_buffers() for submission of data pages. Make it use j_submit_inode_data_buffers hook instead. This effectively switches ext4 fastcommits to use ext4_writepages() for data writeout instead of generic_writepages(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- fs/jbd2/commit.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1da85f626116..4594b62f147b 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -981,7 +981,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); - ret = jbd2_submit_inode_data(ei->jinode); + ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 885a7a6cc53e..4810438b7856 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -207,14 +207,13 @@ int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) } /* Send all the data buffers related to an inode */ -int jbd2_submit_inode_data(struct jbd2_inode *jinode) +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) return 0; trace_jbd2_submit_inode_data(jinode->i_vfs_inode); - return jbd2_journal_submit_inode_data_buffers(jinode); + return journal->j_submit_inode_data_buffers(jinode); } EXPORT_SYMBOL(jbd2_submit_inode_data); -- cgit From 49977f9762fe0f28ec08bec5ab992a5c9623f44c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:13 +0100 Subject: ext4: switch to using write_cache_pages() for data=journal writeout Instead of using generic_writepages(), let's use write_cache_pages() for writeout of journalled data. It will allow us to stop providing .writepage callback. Our data=journal writeback path would benefit from a larger cleanup and refactoring but that's for a separate cleanup series. 
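For reference, a hedged sketch of the write_cache_pages() callback contract this relies on (demo_* names are hypothetical; the writepage_t signature is the kernel's):

/* write_cache_pages() walks the dirty pages tagged in the mapping and
 * invokes the callback for each locked page; plugging batches the
 * resulting bios much as generic_writepages() did internally. */
static int demo_writepage_cb(struct page *page,
			     struct writeback_control *wbc, void *data)
{
	return demo_writepage(page, wbc);	/* hypothetical per-page writeout */
}

static int demo_journalled_writepages(struct address_space *mapping,
				      struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, demo_writepage_cb, NULL);
	blk_finish_plug(&plug);
	return ret;
}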
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-10-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 56f6ef52fe76..fd9a335d1411 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2712,6 +2712,12 @@ out: return err; } +static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, + void *data) +{ + return ext4_writepage(page, wbc); +} + static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; @@ -2738,7 +2744,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) goto out_writepages; if (ext4_should_journal_data(inode)) { - ret = generic_writepages(mapping, wbc); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); + blk_finish_plug(&plug); goto out_writepages; } -- cgit From dae999602eeb0c5482bb62df7fda903f1edc0ff3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:27:15 +0100 Subject: ext4: stop providing .writepage hook Now we don't need the .writepage hook for anything anymore. Reclaim is fine with relying on .writepages to clean pages and we often couldn't do much from the .writepage callback anyway. We only need to provide the .migrate_folio callback for the ext4_journalled_aops - let's use buffer_migrate_folio_norefs() there so that buffers cannot be modified under jbd2's hands as that can cause data corruption. For example when commit code does writeout of transaction buffers in jbd2_journal_write_metadata_buffer(), we don't hold the page lock, have the page writeback bit set, or have the buffer locked. So the page migration code would go and happily migrate the page elsewhere while the copy is running, thus corrupting data.
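A sketch of the resulting ops table shape (demo_* handlers are placeholders; buffer_migrate_folio_norefs is the real helper the patch wires up):

/* Reclaim and sync now reach the filesystem only through .writepages,
 * and .migrate_folio refuses to hand out extra buffer references, so
 * jbd2's unlocked copy in jbd2_journal_write_metadata_buffer() sees
 * stable buffer contents during migration. */
static const struct address_space_operations demo_journalled_aops = {
	.read_folio	= demo_read_folio,
	.writepages	= demo_writepages,
	.dirty_folio	= demo_dirty_folio,
	.migrate_folio	= buffer_migrate_folio_norefs,
};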
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221207112722.22220-12-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fd9a335d1411..38b7db452d37 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3725,7 +3725,6 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis, static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, @@ -3743,7 +3742,6 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3752,6 +3750,7 @@ static const struct address_space_operations ext4_journalled_aops = { .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3760,7 +3759,6 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, -- cgit From 1485f726c6dec1a1f85438f2962feaa3d585526f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:59:27 +0100 Subject: ext4: initialize quota before expanding inode in setproject ioctl Make sure we initialize quotas before possibly expanding inode space (and thus maybe needing to allocate external xattr block) in ext4_ioctl_setproject(). This prevents not accounting the necessary block allocation. Signed-off-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221207115937.26601-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 202953b5db49..8067ccda34e4 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -732,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) if (ext4_is_quota_file(inode)) return err; + err = dquot_initialize(inode); + if (err) + return err; + err = ext4_get_inode_loc(inode, &iloc); if (err) return err; @@ -747,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) brelse(iloc.bh); } - err = dquot_initialize(inode); - if (err) - return err; - handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); -- cgit From 8994d11395f8165b3deca1971946f549f0822630 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 12:59:28 +0100 Subject: ext4: avoid unaccounted block allocation when expanding inode When expanding inode space in ext4_expand_extra_isize_ea() we may need to allocate external xattr block. If quota is not initialized for the inode, the block allocation will not be accounted into quota usage. 
Make sure the quota is initialized before we try to expand inode space. Reported-by: Pengfei Xu Link: https://lore.kernel.org/all/Y5BT+k6xWqthZc1P@xpf.sh.intel.com Signed-off-by: Jan Kara Cc: stable@kernel.org Link: https://lore.kernel.org/r/20221207115937.26601-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 38b7db452d37..5a92e5186313 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5945,6 +5945,14 @@ static int __ext4_expand_extra_isize(struct inode *inode, return 0; } + /* + * We may need to allocate external xattr block so we need quotas + * initialized. Here we can be called with various locks held so we + * cannot affort to initialize quotas ourselves. So just bail. + */ + if (dquot_initialize_needed(inode)) + return -EAGAIN; + /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); -- cgit From cc12a6f25e07ed05d5825a1664b67a970842b2ca Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 8 Dec 2022 10:32:31 +0800 Subject: ext4: allocate extended attribute value in vmalloc area Now, the maximum length of an extended attribute value is 64K. The memory requested here does not need contiguous physical addresses, so it is appropriate to use kvmalloc to request memory. At the same time, this also copes with extended attribute values becoming longer in the future. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221208023233.1231330-3-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6bdd502527f8..b666d3bf8b38 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2548,7 +2548,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - buffer = kmalloc(value_size, GFP_NOFS); + buffer = kvmalloc(value_size, GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !buffer || !b_entry_name) { error = -ENOMEM; @@ -2600,7 +2600,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, error = 0; out: kfree(b_entry_name); - kfree(buffer); + kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) -- cgit From e4db04f7d3dbbe16680e0ded27ea2a65b10f766a Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 8 Dec 2022 10:32:33 +0800 Subject: ext4: fix inode leak in ext4_xattr_inode_create() on an error path The following issue occurs when doing setxattr with fault injection: [localhost]# fsck.ext4 -fn /dev/sda e2fsck 1.46.6-rc1 (12-Sep-2022) Pass 1: Checking inodes, blocks, and sizes Pass 2: Checking directory structure Pass 3: Checking directory connectivity Pass 4: Checking reference counts Unattached zero-length inode 15. Clear? no Unattached inode 15 Connect to /lost+found? no Pass 5: Checking group summary information /dev/sda: ********** WARNING: Filesystem still has errors ********** /dev/sda: 15/655360 files (0.0% non-contiguous), 66755/2621440 blocks This occurs in 'ext4_xattr_inode_create()'. If 'ext4_mark_inode_dirty()' fails, i_nlink of the inode needs to be dropped; otherwise it will lead to an inode leak.
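A sketch of the unwind ordering the fix restores (simplified; the demo_* helpers are hypothetical stand-ins for the ext4 ones):

struct inode *demo_new_ea_inode(struct inode *parent);	/* hypothetical */
int demo_attach_jinode(struct inode *ea_inode);		/* hypothetical */
int demo_dec_ref(handle_t *handle, struct inode *ea_inode);	/* hypothetical */

/* Once the EA inode exists with a live reference, a failure on any
 * later step must drop that reference before iput(); otherwise the
 * inode stays allocated on disk with nothing pointing at it, which is
 * exactly the "unattached inode" fsck complains about. */
static struct inode *demo_create_ea_inode(handle_t *handle,
					  struct inode *parent)
{
	struct inode *ea_inode;
	int err;

	ea_inode = demo_new_ea_inode(parent);
	if (IS_ERR(ea_inode))
		return ea_inode;

	err = demo_attach_jinode(ea_inode);
	if (err) {
		if (demo_dec_ref(handle, ea_inode))	/* undo the link */
			pr_warn("cleanup dec ref failed\n");
		iput(ea_inode);
		return ERR_PTR(err);
	}
	return ea_inode;
}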
Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221208023233.1231330-5-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b666d3bf8b38..7decaaf27e82 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, + "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } -- cgit From 1da18e38cb97e9521e93d63034521a9649524f64 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 8 Dec 2022 11:34:24 +0800 Subject: ext4: fix reserved cluster accounting in __es_remove_extent() When bigalloc is enabled, reserved cluster accounting for delayed allocation is handled in extent_status.c. With a corrupted file system, it's possible for this accounting to be incorrect, as discovered by Syzbot: EXT4-fs error (device loop0): ext4_validate_block_bitmap:398: comm rep: bg 0: block 5: invalid block bitmap EXT4-fs (loop0): Delayed block allocation failed for inode 18 at logical offset 0 with max blocks 32 with error 28 EXT4-fs (loop0): This should not happen!! Data will be lost EXT4-fs (loop0): Total free blocks count 0 EXT4-fs (loop0): Free/Dirty block details EXT4-fs (loop0): free_blocks=0 EXT4-fs (loop0): dirty_blocks=32 EXT4-fs (loop0): Block reservation details EXT4-fs (loop0): i_reserved_data_blocks=2 EXT4-fs (loop0): Inode 18 (00000000845cd634): i_reserved_data_blocks (1) not cleared! The above issue happens as follows: Assume: sbi->s_cluster_ratio = 16 Step1: Insert delay block [0, 31] -> ei->i_reserved_data_blocks=2 Step2: ext4_writepages mpage_map_and_submit_extent -> return failed mpage_release_unused_pages -> to release [0, 30] ext4_es_remove_extent -> remove lblk=0 end=30 __es_remove_extent -> len1=0 len2=31-30=1 __es_remove_extent: ... if (len2 > 0) { ... if (len1 > 0) { ... } else { es->es_lblk = end + 1; es->es_len = len2; ... } if (count_reserved) count_rsvd(inode, lblk, ...); goto out; -> will return but didn't calculate 'reserved' ... Step3: ext4_destroy_inode -> trigger "i_reserved_data_blocks (1) not cleared!" To solve the above issue, call 'get_rsvd()' before the goto out when 'len2 > 0'.
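The generic shape of the bug and fix, as a self-contained sketch (demo_* names are hypothetical): an early goto skipped a cleanup step that must run on every successful path, and routing it through an intermediate label restores the invariant.

struct demo_state;
int demo_shrink_other_cases(struct demo_state *st);	/* hypothetical */
void demo_account_reserved(struct demo_state *st);	/* plays the role of get_rsvd() */

static int demo_remove_range(struct demo_state *st, bool tail_split)
{
	int err = 0;

	if (tail_split) {
		/* ...trim the tail extent in place... */
		goto out_account;	/* was: goto out, which skipped accounting */
	}
	err = demo_shrink_other_cases(st);
	if (err)
		goto out;		/* genuine failures still bypass accounting */
out_account:
	demo_account_reserved(st);
out:
	return err;
}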
Reported-by: syzbot+05a0f0ccab4a25626e38@syzkaller.appspotmail.com Fixes: 8fcc3a580651 ("ext4: rework reserved cluster accounting when invalidating pages") Signed-off-by: Ye Bin Reviewed-by: Eric Whitney Link: https://lore.kernel.org/r/20221208033426.1832460-2-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents_status.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 97eccc0028a1..7bc221038c6c 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1369,7 +1369,7 @@ retry: if (count_reserved) count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, &orig_es, &rc); - goto out; + goto out_get_reserved; } if (len1 > 0) { @@ -1411,6 +1411,7 @@ retry: } } +out_get_reserved: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); out: -- cgit From cfe4c1b25dd6d2f056afc00b7c98bcb3dd0b1fc3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 17:25:10 +0100 Subject: udf: Fix preallocation discarding at indirect extent boundary When preallocation extent is the first one in the extent block, the code would corrupt extent tree header instead. Fix the problem and use udf_delete_aext() for deleting extent to avoid some code duplication. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/truncate.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 532cda99644e..a9790fb32f5f 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -120,60 +120,41 @@ void udf_truncate_tail_extent(struct inode *inode) void udf_discard_prealloc(struct inode *inode) { - struct extent_position epos = { NULL, 0, {0, 0} }; + struct extent_position epos = {}; + struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; - int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == iinfo->i_lenExtents) return; - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(struct short_ad); - else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(struct long_ad); - else - adsize = 0; - epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { - etype = netype; + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + brelse(prev_epos.bh); + prev_epos = epos; + if (prev_epos.bh) + get_bh(prev_epos.bh); + + etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { - epos.offset -= adsize; lbcount -= elen; - extent_trunc(inode, &epos, &eloc, etype, elen, 0); - if (!epos.bh) { - iinfo->i_lenAlloc = - epos.offset - - udf_file_entry_alloc_offset(inode); - mark_inode_dirty(inode); - } else { - struct allocExtDesc *aed = - (struct allocExtDesc *)(epos.bh->b_data); - aed->lengthAllocDescs = - cpu_to_le32(epos.offset - - sizeof(struct allocExtDesc)); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos.bh->b_data, epos.offset); - else - udf_update_tag(epos.bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos.bh, inode); - } + udf_delete_aext(inode, prev_epos); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, + DIV_ROUND_UP(elen, 1 
<< inode->i_blkbits)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; brelse(epos.bh); brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, -- cgit From 6ad53f0f71c52871202a7bf096feb2c59db33fc5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 17:34:33 +0100 Subject: udf: Do not bother looking for prealloc extents if i_lenExtents matches i_size If the block-rounded i_lenExtents matches the block-rounded i_size, there are no preallocation extents. Do not bother walking the extent linked list. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/truncate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index a9790fb32f5f..036ebd892b85 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -127,9 +127,10 @@ void udf_discard_prealloc(struct inode *inode) uint64_t lbcount = 0; int8_t etype = -1, netype; struct udf_inode_info *iinfo = UDF_I(inode); + int bsize = 1 << inode->i_blkbits; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || - inode->i_size == iinfo->i_lenExtents) + ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; epos.block = iinfo->i_location; -- cgit From 16d0556568148bdcaa45d077cac9f8f7077cf70a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 18:17:34 +0100 Subject: udf: Discard preallocation before extending file with a hole When extending a file with a hole, we tried to preserve the existing preallocation for the file. However that is not very useful and complicates the code, because the previous extent may need to be rounded to the block boundary as well (which we forgot to do, thus causing data corruption for a sequence like: xfs_io -f -c "pwrite 0x75e63 11008" -c "truncate 0x7b24b" \ -c "truncate 0xabaa3" -c "pwrite 0xac70b 22954" \ -c "pwrite 0x93a43 11358" -c "pwrite 0xb8e65 52211" file with 512-byte block size). Just discard the preallocation before extending the file to simplify things and also fix this data corruption. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/inode.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 0246b1b86fb9..44988e4c3fb2 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -434,6 +434,12 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } + /* + * Block beyond EOF and prealloc extents? Just discard preallocation + * as it is not useful and complicates things. + */ + if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + udf_discard_prealloc(inode); udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) @@ -483,8 +489,6 @@ static int udf_do_extend_file(struct inode *inode, uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; - struct kernel_lb_addr prealloc_loc = {}; - uint32_t prealloc_len = 0; struct udf_inode_info *iinfo; int err; @@ -505,19 +509,6 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } - /* Last extent are just preallocated blocks?
*/ - if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == - EXT_NOT_RECORDED_ALLOCATED) { - /* Save the extent so that we can reattach it to the end */ - prealloc_loc = last_ext->extLocation; - prealloc_len = last_ext->extLength; - /* Mark the extent as a hole */ - last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); - last_ext->extLocation.logicalBlockNum = 0; - last_ext->extLocation.partitionReferenceNum = 0; - } - /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -545,7 +536,7 @@ static int udf_do_extend_file(struct inode *inode, * more extents, we may need to enter possible following * empty indirect extent. */ - if (new_block_bytes || prealloc_len) + if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } @@ -579,17 +570,6 @@ static int udf_do_extend_file(struct inode *inode, } out: - /* Do we have some preallocated blocks saved? */ - if (prealloc_len) { - err = udf_add_aext(inode, last_pos, &prealloc_loc, - prealloc_len, 1); - if (err) - return err; - last_ext->extLocation = prealloc_loc; - last_ext->extLength = prealloc_len; - count++; - } - /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); @@ -642,8 +622,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + /* + * When creating hole in file, just don't bother with preserving + * preallocation. It likely won't be very useful anyway. + */ + udf_discard_prealloc(inode); + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); within_final_block = (etype != -1); + /* We don't expect extents past EOF... */ + WARN_ON_ONCE(etype != -1 && + elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { @@ -772,10 +761,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goto out_free; } - /* Are we beyond EOF? */ + /* Are we beyond EOF and preallocated extent? */ if (etype == -1) { int ret; loff_t hole_len; + isBeyondEOF = true; if (count) { if (c) -- cgit From 1f3868f06855c97a4954c99b36f3fc9eb8f60326 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Dec 2022 13:03:30 +0100 Subject: udf: Fix extending file within last block When extending file within last block it can happen that the extent is already rounded to the blocksize and thus contains the offset we want to grow up to. In such case we would mistakenly expand the last extent and make it one block longer than it should be, exposing unallocated block in a file and causing data corruption. Fix the problem by properly detecting this case and bailing out. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/inode.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 44988e4c3fb2..1d7c2a812fc1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -585,13 +585,17 @@ out: static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, - uint32_t final_block_len) + uint32_t new_elen) { - struct super_block *sb = inode->i_sb; uint32_t added_bytes; - added_bytes = final_block_len - - (last_ext->extLength & (sb->s_blocksize - 1)); + /* + * Extent already large enough? 
It may be already rounded up to block + * size... + */ + if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) + return; + added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen; last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; @@ -608,12 +612,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; - unsigned long partial_final_block; + loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; - int within_final_block; + bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -629,9 +633,9 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) udf_discard_prealloc(inode); etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); - within_final_block = (etype != -1); + within_last_ext = (etype != -1); /* We don't expect extents past EOF... */ - WARN_ON_ONCE(etype != -1 && + WARN_ON_ONCE(within_last_ext && elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || @@ -648,19 +652,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) extent.extLength |= etype << 30; } - partial_final_block = newsize & (sb->s_blocksize - 1); + new_elen = ((loff_t)offset << inode->i_blkbits) | + (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ - if (within_final_block) { + if (within_last_ext) { /* Extending file within the last file block */ - udf_do_extend_final_block(inode, &epos, &extent, - partial_final_block); + udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { - loff_t add = ((loff_t)offset << sb->s_blocksize_bits) | - partial_final_block; - err = udf_do_extend_file(inode, &epos, &extent, add); + err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) -- cgit From 22ae4c114f77b55a4c5036e8f70409a0799a08f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Nov 2022 14:44:50 -0400 Subject: nfsd: fix up the filecache laundrette scheduling We don't really care whether there are hashed entries when it comes to scheduling the laundrette. They might all be non-gc entries, after all. We only want to schedule it if there are entries on the LRU. Switch to using list_lru_count, and move the check into nfsd_file_gc_worker. The other callsite in nfsd_file_put doesn't need to count entries, since it only schedules the laundrette after adding an entry to the LRU. 
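A sketch of the resulting scheduling policy (demo_* names are placeholders; list_lru_count() and queue_delayed_work() are the real APIs): the worker re-arms itself only while the LRU is non-empty, so an idle cache lets the delayed work expire.

static struct list_lru demo_file_lru;		/* assumed initialized elsewhere */
static void demo_gc(void);			/* hypothetical: reap expired entries */
static void demo_gc_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_laundrette, demo_gc_worker);
#define DEMO_LAUNDRETTE_DELAY	(2 * HZ)	/* assumed interval */

static void demo_gc_worker(struct work_struct *work)
{
	demo_gc();
	/* Re-arm only if there is still something to age out. */
	if (list_lru_count(&demo_file_lru))
		queue_delayed_work(system_wq, &demo_laundrette,
				   DEMO_LAUNDRETTE_DELAY);
}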
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 1418bc889aa0..da7c9cb12403 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -211,12 +211,9 @@ static const struct rhashtable_params nfsd_file_rhash_params = { static void nfsd_file_schedule_laundrette(void) { - if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) || - test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) - return; - - queue_delayed_work(system_wq, &nfsd_filecache_laundrette, - NFSD_LAUNDRETTE_DELAY); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) + queue_delayed_work(system_wq, &nfsd_filecache_laundrette, + NFSD_LAUNDRETTE_DELAY); } static void @@ -666,7 +663,8 @@ static void nfsd_file_gc_worker(struct work_struct *work) { nfsd_file_gc(); - nfsd_file_schedule_laundrette(); + if (list_lru_count(&nfsd_file_lru)) + nfsd_file_schedule_laundrette(); } static unsigned long -- cgit From d7064eaf688cfe454c50db9f59298463d80d403c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Nov 2022 16:22:48 -0400 Subject: NFSD: Add an nfsd_file_fsync tracepoint Add a tracepoint to capture the number of filecache-triggered fsync calls and which files needed it. Also, record when an fsync triggers a write verifier reset. Examples: <...>-97 [007] 262.505611: nfsd_file_free: inode=0xffff888171e08140 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d2400 <...>-97 [007] 262.505612: nfsd_file_fsync: inode=0xffff888171e08140 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d2400 ret=0 <...>-97 [007] 262.505623: nfsd_file_free: inode=0xffff888171e08dc0 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d1e00 <...>-97 [007] 262.505624: nfsd_file_fsync: inode=0xffff888171e08dc0 ref=0 flags=GC may=WRITE nf_file=0xffff8881373d1e00 ret=0 Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/filecache.c | 5 ++++- fs/nfsd/trace.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index da7c9cb12403..1998b4d5f692 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -336,10 +336,13 @@ static void nfsd_file_fsync(struct nfsd_file *nf) { struct file *file = nf->nf_file; + int ret; if (!file || !(file->f_mode & FMODE_WRITE)) return; - if (vfs_fsync(file, 1) != 0) + ret = vfs_fsync(file, 1); + trace_nfsd_file_fsync(nf, ret); + if (ret) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 0177cfa50556..33567b87cc6b 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1238,6 +1238,37 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); +TRACE_EVENT(nfsd_file_fsync, + TP_PROTO( + const struct nfsd_file *nf, + int ret + ), + TP_ARGS(nf, ret), + TP_STRUCT__entry( + __field(void *, nf_inode) + __field(int, nf_ref) + __field(int, ret) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->ret = ret; + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d", + __entry->nf_inode, + __entry->nf_ref, + 
show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file, __entry->ret + ) +); + #include "cache.h" TRACE_DEFINE_ENUM(RC_DROPIT); -- cgit From 18ebd35b61b4693a0ddc270b6d4f18def232e770 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 6 Nov 2022 14:02:39 -0500 Subject: lockd: set other missing fields when unlocking files vfs_lock_file() expects the struct file_lock to be fully initialised by the caller. Re-exported NFSv3 has been seen to Oops if the fl_file field is NULL. Fixes: aec158242b87 ("lockd: set fl_owner when unlocking files") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Link: https://bugzilla.kernel.org/show_bug.cgi?id=216582 Signed-off-by: Chuck Lever --- fs/lockd/svcsubs.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e1c4617de771..3515f17eaf3f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file) } } -static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) +static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl) { struct file_lock lock; @@ -184,12 +184,15 @@ static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; - lock.fl_owner = owner; - if (file->f_file[O_RDONLY] && - vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) + lock.fl_owner = fl->fl_owner; + lock.fl_pid = fl->fl_pid; + lock.fl_flags = FL_POSIX; + + lock.fl_file = file->f_file[O_RDONLY]; + if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) goto out_err; - if (file->f_file[O_WRONLY] && - vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL)) + lock.fl_file = file->f_file[O_WRONLY]; + if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL)) goto out_err; return 0; out_err: @@ -226,7 +229,7 @@ again: if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); - if (nlm_unlock_files(file, fl->fl_owner)) + if (nlm_unlock_files(file, fl)) return 1; goto again; } -- cgit From 01d53a88c08951f88f2a42f1f1e6568928e0590e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Nov 2022 06:58:41 -0500 Subject: nfsd: return error if nfs4_setacl fails With the addition of POSIX ACLs to struct nfsd_attrs, we no longer return an error if setting the ACL fails. Ensure we return the na_aclerr error on SETATTR if there is one. Fixes: c0cbe70742f4 ("NFSD: add posix ACLs to struct nfsd_attrs") Cc: Neil Brown Reported-by: Yongcheng Yang Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6f7e0c5e62d2..a6173677b766 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1135,6 +1135,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 0, (time64_t)0); if (!status) status = nfserrno(attrs.na_labelerr); + if (!status) + status = nfserrno(attrs.na_aclerr); out: nfsd_attrs_free(&attrs); fh_drop_write(&cstate->current_fh); -- cgit From 85a0d0c9a58002ef7d1bf5e3ea630f4fbd42a4f0 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 11 Nov 2022 17:18:35 +0800 Subject: NFSD: Use struct_size() helper in alloc_session() Use struct_size() helper to simplify the code, no functional changes. 
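For readers unfamiliar with struct_size(), here is a minimal user-space sketch of the size computation relied on above, using a simplified stand-in for struct nfsd4_session. STRUCT_SIZE open-codes sizeof(header) + n * sizeof(element); the real kernel helper also saturates on arithmetic overflow, which this sketch omits.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct nfsd4_session: a fixed header plus a
 * flexible array of slot pointers. */
struct session {
	int maxreqs;
	void *se_slots[];	/* flexible array member */
};

/* Open-coded equivalent of the kernel's struct_size() helper. */
#define STRUCT_SIZE(p, member, n) \
	(sizeof(*(p)) + (size_t)(n) * sizeof((p)->member[0]))

int main(void)
{
	int numslots = 16;
	struct session *sess = calloc(1, STRUCT_SIZE(sess, se_slots, numslots));

	if (!sess)
		return 1;
	sess->maxreqs = numslots;
	printf("allocated %zu bytes for %d slots\n",
	       STRUCT_SIZE(sess, se_slots, numslots), numslots);
	free(sess);
	return 0;
}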
Signed-off-by: Xiu Jianfeng Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2ec981fd2985..97badca3135e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1833,13 +1833,12 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, int numslots = fattrs->maxreqs; int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; - int mem, i; + int i; - BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) - + sizeof(struct nfsd4_session) > PAGE_SIZE); - mem = numslots * sizeof(struct nfsd4_slot *); + BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION) + > PAGE_SIZE); - new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL); if (!new) return NULL; /* allocate each struct nfsd4_slot and data cache in one piece */ -- cgit From 75c7940d2a86d3f1b60a0a265478cb8fc887b970 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:36 -0500 Subject: lockd: set missing fl_flags field when retrieving args Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 1 + fs/lockd/svcproc.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 284b019cb652..b72023a6b4c1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -52,6 +52,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, *filp = file; /* Set up the missing parts of the file_lock structure */ + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_start = (loff_t)lock->lock_start; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index e35c05e27806..32784f508c81 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -77,6 +77,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Set up the missing parts of the file_lock structure */ mode = lock_to_openmode(&lock->fl); + lock->fl.fl_flags = FL_POSIX; lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; -- cgit From 69efce009f7df888e1fede3cb2913690eb829f52 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:37 -0500 Subject: lockd: ensure we use the correct file descriptor when unlocking Shared locks are set on O_RDONLY descriptors and exclusive locks are set on O_WRONLY ones. nlmsvc_unlock however calls vfs_lock_file twice, once for each descriptor, but it doesn't reset fl_file. Ensure that it does. 
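A toy model of the bug class fixed here: a request structure reused across two calls must have its per-call field re-pointed before each call, or the second call acts on the first call's descriptor. All names below are illustrative stand-ins, not the lockd code.

#include <stdio.h>

/* Minimal stand-in for the reused lock request. */
struct file_lock { const char *fl_file; };

static void vfs_lock_file_stub(const struct file_lock *fl)
{
	printf("F_UNLCK via %s descriptor\n", fl->fl_file);
}

int main(void)
{
	struct file_lock fl = { .fl_file = "O_RDONLY" };

	vfs_lock_file_stub(&fl);	/* read-side unlock */
	fl.fl_file = "O_WRONLY";	/* the fix: re-point before reuse */
	vfs_lock_file_stub(&fl);	/* write-side unlock */
	return 0;
}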
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9c1aa75441e1..9eae99e08e69 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -659,11 +659,13 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - if (file->f_file[O_RDONLY]) - error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_RDONLY]; + if (lock->fl.fl_file) + error = vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); - if (file->f_file[O_WRONLY]) - error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + lock->fl.fl_file = file->f_file[O_WRONLY]; + if (lock->fl.fl_file) + error |= vfs_lock_file(lock->fl.fl_file, F_SETLK, &lock->fl, NULL); return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; -- cgit From 9f27783b4dd235ef3c8dbf69fc6322777450323c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 14:36:38 -0500 Subject: lockd: fix file selection in nlmsvc_cancel_blocked We currently do a lock_to_openmode call based on the arguments from the NLM_UNLOCK call, but that will always set the fl_type of the lock to F_UNLCK, and the O_RDONLY descriptor is always chosen. Fix it to use the file_lock from the block instead. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 9eae99e08e69..4e30f3c50970 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -699,9 +699,10 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - mode = lock_to_openmode(&lock->fl); - vfs_cancel_lock(block->b_file->f_file[mode], - &block->b_call->a_args.lock.fl); + struct file_lock *fl = &block->b_call->a_args.lock.fl; + + mode = lock_to_openmode(fl); + vfs_cancel_lock(block->b_file->f_file[mode], fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); } -- cgit From 79a1d88a36f77374c77fd41a4386d8c2736b8704 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 16 Nov 2022 10:28:36 -0500 Subject: NFSD: pass range end to vfs_fsync_range() instead of count _nfsd_copy_file_range() calls vfs_fsync_range() with an offset and count (bytes written), but the latter wants the start and end bytes of the range to sync. Fix it up.
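A small sketch of the offset-and-count to range-end conversion the fix below applies. range_end() is an illustrative helper, not a kernel function; vfs_fsync_range() itself takes an inclusive [start, end] byte range.

#include <stdio.h>

/* A byte count converts to an inclusive range end as pos + count - 1. */
static long long range_end(long long pos, long long count)
{
	return pos + count - 1;
}

int main(void)
{
	/* 4096 bytes written at offset 0: sync [0, 4095], not [0, 4096]. */
	printf("end = %lld\n", range_end(0, 4096));
	/* 100 bytes written at offset 8192: sync [8192, 8291]. */
	printf("end = %lld\n", range_end(8192, 100));
	return 0;
}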
Fixes: eac0b17a77fb ("NFSD add vfs_fsync after async copy is done") Signed-off-by: Brian Foster Tested-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a6173677b766..73ed32ad23a2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1642,6 +1642,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; int status; + loff_t end; /* See RFC 7862 p.67: */ if (bytes_total == 0) @@ -1661,8 +1662,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, /* for a non-zero asynchronous copy do a commit of data */ if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) { since = READ_ONCE(dst->f_wb_err); - status = vfs_fsync_range(dst, copy->cp_dst_pos, - copy->cp_res.wr_bytes_written, 0); + end = copy->cp_dst_pos + copy->cp_res.wr_bytes_written - 1; + status = vfs_fsync_range(dst, copy->cp_dst_pos, end, 0); if (!status) status = filemap_check_wb_err(dst->f_mapping, since); if (!status) -- cgit From 247c01ff5f8d66e62a404c91733be52fecb8b7f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 14 Nov 2022 08:57:43 -0500 Subject: trace: Relocate event helper files Steven Rostedt says: > The include/trace/events/ directory should only hold files that > are to create events, not headers that hold helper functions. > > Can you please move them out of include/trace/events/ as that > directory is "special" in the creation of events. Signed-off-by: Chuck Lever Acked-by: Leon Romanovsky Acked-by: Steven Rostedt (Google) Acked-by: Anna Schumaker --- fs/nfs/nfs4trace.h | 6 +++--- fs/nfs/nfstrace.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2cff5901c689..633cc64a04da 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -9,10 +9,10 @@ #define _TRACE_NFS4_H #include -#include +#include -#include -#include +#include +#include #define show_nfs_fattr_flags(valid) \ __print_flags((unsigned long)valid, "|", \ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8c6cc58679ff..642f6921852f 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -11,9 +11,9 @@ #include #include -#include -#include -#include +#include +#include +#include #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ -- cgit From a1049eb47f20b9eabf9afb218578fff16b4baca6 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:45 -0800 Subject: NFSD: refactoring courtesy_client_reaper to a generic low memory shrinker Refactoring courtesy_client_reaper to generic low memory shrinker so it can be used for other purposes. 
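The shape of that refactor, reduced to a minimal user-space sketch. The function names mirror the patch, but the bodies are illustrative only; in the kernel the worker is a delayed work item, not a direct call.

#include <stdio.h>

/* The workqueue callback becomes a thin generic worker, and the
 * courtesy-client logic is just one reaper it dispatches to. */
static void courtesy_client_reaper(void)
{
	printf("reaping courtesy clients\n");
}

static void state_shrinker_worker(void)
{
	courtesy_client_reaper();
	/* deleg_reaper() is added here by a later patch */
}

int main(void)
{
	state_shrinker_worker();	/* stands in for the delayed work */
	return 0;
}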
Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 97badca3135e..00b3fa164fe8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4361,7 +4361,7 @@ out: } static unsigned long -nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) +nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int cnt; struct nfsd_net *nn = container_of(shrink, @@ -4374,7 +4374,7 @@ nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) } static unsigned long -nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc) +nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { return SHRINK_STOP; } @@ -4401,8 +4401,8 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); atomic_set(&nn->nfsd_courtesy_clients, 0); - nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan; - nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count; + nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); } @@ -6151,17 +6151,24 @@ laundromat_main(struct work_struct *laundry) } static void -courtesy_client_reaper(struct work_struct *reaper) +courtesy_client_reaper(struct nfsd_net *nn) { struct list_head reaplist; - struct delayed_work *dwork = to_delayed_work(reaper); - struct nfsd_net *nn = container_of(dwork, struct nfsd_net, - nfsd_shrinker_work); nfs4_get_courtesy_client_reaplist(nn, &reaplist); nfs4_process_client_reaplist(&reaplist); } +static void +nfsd4_state_shrinker_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + nfsd_shrinker_work); + + courtesy_client_reaper(nn); +} + static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) @@ -7985,7 +7992,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); - INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, courtesy_client_reaper); + INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); return 0; -- cgit From 3959066b697b5dfbb7141124ae9665337d4bc638 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:46 -0800 Subject: NFSD: add support for sending CB_RECALL_ANY Add XDR encode and decode function for CB_RECALL_ANY. 
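A rough user-space sketch of the wire format the new encoder produces for CB_RECALLANY4args: one XDR uint32 for craa_objects_to_keep, then a bitmap4 encoded as a word count followed by that many mask words, all big-endian. xdr_u32() is a hand-rolled stand-in for the kernel's xdr_stream helpers, and the bit position assumes RCA4_TYPE_MASK_RDATA_DLG = 0 from RFC 5661.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

static unsigned char buf[64];
static size_t off;

/* Append one big-endian 32-bit word, as the xdr_stream helpers do. */
static void xdr_u32(uint32_t v)
{
	uint32_t be = htonl(v);

	memcpy(buf + off, &be, sizeof(be));
	off += sizeof(be);
}

int main(void)
{
	uint32_t bmval[1] = { 1u << 0 };	/* RCA4_TYPE_MASK_RDATA_DLG */

	xdr_u32(0);		/* ra_keep: ask the client to keep none */
	xdr_u32(1);		/* bitmap4 length in words */
	xdr_u32(bmval[0]);	/* the mask word */
	printf("encoded %zu bytes\n", off);
	return 0;
}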
Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/state.h | 1 + fs/nfsd/xdr4.h | 5 ++++ fs/nfsd/xdr4cb.h | 6 +++++ 4 files changed, 84 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f0e69edf5f0f..1b57f2c2f0bb 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -76,6 +76,17 @@ static __be32 *xdr_encode_empty_array(__be32 *p) * 1 Protocol" */ +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ + WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); +} + +static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, + size_t len) +{ + WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); +} + /* * nfs_cb_opnum4 * @@ -328,6 +339,24 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, hdr->nops++; } +/* + * CB_RECALLANY4args + * + * struct CB_RECALLANY4args { + * uint32_t craa_objects_to_keep; + * bitmap4 craa_type_mask; + * }; + */ +static void +encode_cb_recallany4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra) +{ + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY); + encode_uint32(xdr, ra->ra_keep); + encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval)); + hdr->nops++; +} + /* * CB_SEQUENCE4args * @@ -482,6 +511,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, encode_cb_nops(&hdr); } +/* + * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static void +nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfsd4_cb_recall_any *ra; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb); + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recallany4args(xdr, &hdr, ra); + encode_cb_nops(&hdr); +} /* * NFSv4.0 and NFSv4.1 XDR decode functions @@ -520,6 +569,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } +/* + * 20.6. 
Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static int +nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status); + return status; +} + #ifdef CONFIG_NFSD_PNFS /* * CB_LAYOUTRECALL4args @@ -783,6 +854,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { #endif PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), + PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index eadd7f465bf5..e30882f8b851 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -636,6 +636,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_OFFLOAD, NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, + NFSPROC4_CLNT_CB_RECALL_ANY, }; /* Returns true iff a is later than b: */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 0eb00105d845..4fd2cf6d1d2d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -896,5 +896,10 @@ struct nfsd4_operation { union nfsd4_op_u *); }; +struct nfsd4_cb_recall_any { + struct nfsd4_callback ra_cb; + u32 ra_keep; + u32 ra_bmval[1]; +}; #endif diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 547cf07cf4e0..0d39af1b00a0 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -48,3 +48,9 @@ #define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 1 + 1) +#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) -- cgit From 44df6f439a1790a5f602e3842879efa88f346672 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:47 -0800 Subject: NFSD: add delegation reaper to react to low memory condition The delegation reaper is called by the nfsd memory shrinker's 'count' callback. It scans the client list and sends the courtesy CB_RECALL_ANY to the clients that hold delegations. To avoid flooding the clients with CB_RECALL_ANY requests, the delegation reaper sends only one CB_RECALL_ANY request to each client per 5 seconds.
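The per-client throttle described above, reduced to a single-client user-space sketch. may_send_recall_any() is illustrative; in the patch the timestamp lives in cl_ra_time and is checked under the client lock.

#include <stdio.h>
#include <time.h>

struct client { time_t ra_time; };	/* models cl_ra_time */

/* Allow at most one CB_RECALL_ANY per client per 5 seconds. */
static int may_send_recall_any(struct client *clp, time_t now)
{
	if (now - clp->ra_time < 5)
		return 0;		/* throttled */
	clp->ra_time = now;		/* record this send */
	return 1;
}

int main(void)
{
	struct client c = { .ra_time = 0 };
	time_t now = time(NULL);

	printf("first:  %d\n", may_send_recall_any(&c, now));	/* 1 */
	printf("again:  %d\n", may_send_recall_any(&c, now));	/* 0 */
	printf("+5 sec: %d\n", may_send_recall_any(&c, now + 5)); /* 1 */
	return 0;
}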
Signed-off-by: Dai Ngo [ cel: moved definition of RCA4_TYPE_MASK_RDATA_DLG ] Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- fs/nfsd/state.h | 5 +++ 2 files changed, 89 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 00b3fa164fe8..c981704ec6a1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2144,6 +2144,7 @@ static void __free_client(struct kref *k) kfree(clp->cl_nii_domain.data); kfree(clp->cl_nii_name.data); idr_destroy(&clp->cl_stateids); + kfree(clp->cl_ra); kmem_cache_free(client_slab, clp); } @@ -2871,6 +2872,36 @@ static const struct tree_descr client_files[] = { [3] = {""}, }; +static int +nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + put_client_renew_locked(clp); + spin_unlock(&nn->client_lock); +} + +static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { + .done = nfsd4_cb_recall_any_done, + .release = nfsd4_cb_recall_any_release, +}; + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -2908,6 +2939,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name, free_client(clp); return NULL; } + clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL); + if (!clp->cl_ra) { + free_client(clp); + return NULL; + } + clp->cl_ra_time = 0; + nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops, + NFSPROC4_CLNT_CB_RECALL_ANY); return clp; } @@ -4363,14 +4402,16 @@ out: static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - int cnt; + int count; struct nfsd_net *nn = container_of(shrink, struct nfsd_net, nfsd_client_shrinker); - cnt = atomic_read(&nn->nfsd_courtesy_clients); - if (cnt > 0) + count = atomic_read(&nn->nfsd_courtesy_clients); + if (!count) + count = atomic_long_read(&num_delegations); + if (count) mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); - return (unsigned long)cnt; + return (unsigned long)count; } static unsigned long @@ -6159,6 +6200,44 @@ courtesy_client_reaper(struct nfsd_net *nn) nfs4_process_client_reaplist(&reaplist); } +static void +deleg_reaper(struct nfsd_net *nn) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + struct list_head cblist; + + INIT_LIST_HEAD(&cblist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state != NFSD4_ACTIVE || + list_empty(&clp->cl_delegations) || + atomic_read(&clp->cl_delegs_in_recall) || + test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || + (ktime_get_boottime_seconds() - + clp->cl_ra_time < 5)) { + continue; + } + list_add(&clp->cl_ra_cblist, &cblist); + + /* release in nfsd4_cb_recall_any_release */ + atomic_inc(&clp->cl_rpc_users); + set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); + clp->cl_ra_time = ktime_get_boottime_seconds(); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&cblist)) { + clp = list_first_entry(&cblist, struct nfs4_client, + cl_ra_cblist); + 
list_del_init(&clp->cl_ra_cblist); + clp->cl_ra->ra_keep = 0; + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + nfsd4_run_cb(&clp->cl_ra->ra_cb); + } +} + static void nfsd4_state_shrinker_worker(struct work_struct *work) { @@ -6167,6 +6246,7 @@ nfsd4_state_shrinker_worker(struct work_struct *work) nfsd_shrinker_work); courtesy_client_reaper(nn); + deleg_reaper(nn); } static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e30882f8b851..e94634d30591 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -368,6 +368,7 @@ struct nfs4_client { #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) +#define NFSD4_CLIENT_CB_RECALL_ANY (6) unsigned long cl_flags; const struct cred *cl_cb_cred; struct rpc_clnt *cl_cb_client; @@ -411,6 +412,10 @@ struct nfs4_client { unsigned int cl_state; atomic_t cl_delegs_in_recall; + + struct nfsd4_cb_recall_any *cl_ra; + time64_t cl_ra_time; + struct list_head cl_ra_cblist; }; /* struct nfs4_client_reset -- cgit From 638593be55c0b37a1930038460a9918215d5c24b Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Wed, 16 Nov 2022 19:44:48 -0800 Subject: NFSD: add CB_RECALL_ANY tracepoints Add tracepoints to trace start and end of CB_RECALL_ANY operation. Signed-off-by: Dai Ngo [ cel: added show_rca_mask() macro ] Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 ++ fs/nfsd/trace.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c981704ec6a1..e1e85c21f12b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2876,6 +2876,7 @@ static int nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, struct rpc_task *task) { + trace_nfsd_cb_recall_any_done(cb, task); switch (task->tk_status) { case -NFS4ERR_DELAY: rpc_delay(task, 2 * HZ); @@ -6234,6 +6235,7 @@ deleg_reaper(struct nfsd_net *nn) list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 33567b87cc6b..46b8f68a2497 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,9 +9,12 @@ #define _NFSD_TRACE_H #include +#include +#include #include "export.h" #include "nfsfh.h" +#include "xdr4.h" #define NFSD_TRACE_PROC_RES_FIELDS \ __field(unsigned int, netns_ino) \ @@ -1562,6 +1565,32 @@ TRACE_EVENT(nfsd_cb_offload, __entry->fh_hash, __entry->count, __entry->status) ); +TRACE_EVENT(nfsd_cb_recall_any, + TP_PROTO( + const struct nfsd4_cb_recall_any *ra + ), + TP_ARGS(ra), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, keep) + __field(unsigned long, bmval0) + __sockaddr(addr, ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = ra->ra_cb.cb_clp->cl_clientid.cl_boot; + __entry->cl_id = ra->ra_cb.cb_clp->cl_clientid.cl_id; + __entry->keep = ra->ra_keep; + __entry->bmval0 = ra->ra_bmval[0]; + __assign_sockaddr(addr, &ra->ra_cb.cb_clp->cl_addr, + ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen); + ), + TP_printk("addr=%pISpc client %08x:%08x keep=%u bmval0=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->keep, show_rca_mask(__entry->bmval0) + ) +); + DECLARE_EVENT_CLASS(nfsd_cb_done_class, TP_PROTO( const stateid_t *stp, @@ -1601,6 +1630,27 @@ 
DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done); DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done); DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done); +TRACE_EVENT(nfsd_cb_recall_any_done, + TP_PROTO( + const struct nfsd4_callback *cb, + const struct rpc_task *task + ), + TP_ARGS(cb, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(int, status) + ), + TP_fast_assign( + __entry->status = task->tk_status; + __entry->cl_boot = cb->cb_clp->cl_clientid.cl_boot; + __entry->cl_id = cb->cb_clp->cl_clientid.cl_id; + ), + TP_printk("client %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->status + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH -- cgit From 9315564747cb6a570e99196b3a4880fb817635fd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 26 Nov 2022 15:55:30 -0500 Subject: NFSD: Use only RQ_DROPME to signal the need to drop a reply Clean up: NFSv2 has the only two usages of rpc_drop_reply in the NFSD code base. Since NFSv2 is going away at some point, replace these in order to simplify the "drop this reply?" check in nfsd_dispatch(). Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- fs/nfsd/nfsproc.c | 4 ++-- fs/nfsd/nfssvc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 52fc222c34f2..a5570cf75f3f 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -211,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } @@ -246,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - return rpc_drop_reply; + __set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 62e473b0ca52..56fba1cba3af 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1060,7 +1060,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) svcxdr_init_encode(rqstp); *statp = proc->pc_func(rqstp); - if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) + if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) -- cgit From e78e274eb22d966258a3845acc71d3c5b8ee2ea8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Dec 2022 12:48:59 -0800 Subject: NFSD: Avoid clashing function prototypes When built with Control Flow Integrity, function prototypes between caller and function declaration must match. These mismatches are visible at compile time with the new -Wcast-function-type-strict in Clang[1]. There were 97 warnings produced by NFS. For example: fs/nfsd/nfs4xdr.c:2228:17: warning: cast from '__be32 (*)(struct nfsd4_compoundargs *, struct nfsd4_access *)' (aka 'unsigned int (*)(struct nfsd4_compoundargs *, struct nfsd4_access *)') to 'nfsd4_dec' (aka 'unsigned int (*)(struct nfsd4_compoundargs *, void *)') converts to incompatible function type [-Wcast-function-type-strict] [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The enc/dec callbacks were defined as passing "void *" as the second argument, but were being implicitly cast to a new type. Replace the argument with union nfsd4_op_u, and perform explicit member selection in the function body. 
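The same pattern, reduced to a self-contained sketch: a hypothetical two-member union stands in for the much larger union nfsd4_op_u, and the ops table stores each handler without any function cast, so caller and callee prototypes match exactly under CFI.

#include <stdio.h>

union op_u {
	int access;
	long close;
};

/* One exact prototype shared by every table entry. */
typedef int (*op_dec)(union op_u *u);

static int decode_access(union op_u *u)
{
	int *access = &u->access;	/* explicit member selection */

	return *access + 1;
}

static const op_dec ops[] = {
	decode_access,			/* no (op_dec) cast needed */
};

int main(void)
{
	union op_u u = { .access = 41 };

	printf("%d\n", ops[0](&u));	/* prints 42 */
	return 0;
}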
There are no resulting binary differences. Changes were made mechanically using the following Coccinelle script, with minor by-hand fixes for members that didn't already match their existing argument name: @find@ identifier func; type T, opsT; identifier ops, N; @@ opsT ops[] = { [N] = (T) func, }; @already_void@ identifier find.func; identifier name; @@ func(..., -void +union nfsd4_op_u *name) { ... } @proto depends on !already_void@ identifier find.func; type T; identifier name; position p; @@ func@p(..., T name ) { ... } @script:python get_member@ type_name << proto.T; member; @@ coccinelle.member = cocci.make_ident(type_name.split("_", 1)[1].split(' ',1)[0]) @convert@ identifier find.func; type proto.T; identifier proto.name; position proto.p; identifier get_member.member; @@ func@p(..., - T name + union nfsd4_op_u *u ) { + T name = &u->member; ... } @cast@ identifier find.func; type T, opsT; identifier ops, N; @@ opsT ops[] = { [N] = - (T) func, }; Cc: Chuck Lever Cc: Jeff Layton Cc: Gustavo A. R. Silva Cc: linux-nfs@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 632 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 377 insertions(+), 255 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1e4990fb1692..2b4ae858c89b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -770,16 +770,18 @@ nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, - struct nfsd4_access *access) + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0) return nfserr_bad_xdr; return nfs_ok; } static __be32 -nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0) return nfserr_bad_xdr; return nfsd4_decode_stateid4(argp, &close->cl_stateid); @@ -787,8 +789,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) static __be32 -nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) @@ -798,8 +801,9 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit } static __be32 -nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; __be32 *p, status; memset(create, 0, sizeof(*create)); @@ -844,22 +848,25 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create } static inline __be32 -nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_delegreturn *dr = &u->delegreturn; return nfsd4_decode_stateid4(argp, &dr->dr_stateid); } static inline __be32 -nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + 
struct nfsd4_getattr *getattr = &u->getattr; memset(getattr, 0, sizeof(*getattr)); return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, ARRAY_SIZE(getattr->ga_bmval)); } static __be32 -nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; memset(link, 0, sizeof(*link)); return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); } @@ -907,8 +914,9 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; memset(lock, 0, sizeof(*lock)); if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) return nfserr_bad_xdr; @@ -924,8 +932,9 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) } static __be32 -nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; memset(lockt, 0, sizeof(*lockt)); if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0) return nfserr_bad_xdr; @@ -940,8 +949,9 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) } static __be32 -nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; __be32 status; if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0) @@ -962,8 +972,9 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) } static __be32 -nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_lookup *lookup = &u->lookup; return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len); } @@ -1143,8 +1154,9 @@ nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; __be32 status; u32 dummy; @@ -1171,8 +1183,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) } static __be32 -nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *open_conf = &u->open_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1190,8 +1204,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con } static __be32 -nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *open_down = &u->open_downgrade; __be32 status; memset(open_down, 0, sizeof(*open_down)); @@ -1209,8 +1225,9 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d } static __be32 -nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_putfh *putfh = 
&u->putfh; __be32 *p; if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0) @@ -1229,7 +1246,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) } static __be32 -nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { if (argp->minorversion == 0) return nfs_ok; @@ -1237,8 +1254,9 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) } static __be32 -nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; __be32 status; memset(read, 0, sizeof(*read)); @@ -1254,8 +1272,9 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) } static __be32 -nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; __be32 status; memset(readdir, 0, sizeof(*readdir)); @@ -1276,15 +1295,17 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read } static __be32 -nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo)); return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } static __be32 -nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; __be32 status; memset(rename, 0, sizeof(*rename)); @@ -1295,22 +1316,25 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename } static __be32 -nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + clientid_t *clientid = &u->renew; return nfsd4_decode_clientid4(argp, clientid); } static __be32 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; secinfo->si_exp = NULL; return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } static __be32 -nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; __be32 status; memset(setattr, 0, sizeof(*setattr)); @@ -1324,8 +1348,9 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta } static __be32 -nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setclientid *setclientid = &u->setclientid; __be32 *p, status; memset(setclientid, 0, sizeof(*setclientid)); @@ -1367,8 +1392,10 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient } static __be32 -nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_setclientid_confirm *scd_c = 
&u->setclientid_confirm; __be32 status; if (argp->minorversion >= 1) @@ -1382,8 +1409,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s /* Also used for NVERIFY */ static __be32 -nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_verify *verify = &u->verify; __be32 *p, status; memset(verify, 0, sizeof(*verify)); @@ -1409,8 +1437,9 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify } static __be32 -nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; __be32 status; status = nfsd4_decode_stateid4(argp, &write->wr_stateid); @@ -1434,8 +1463,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) } static __be32 -nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; __be32 status; if (argp->minorversion >= 1) @@ -1452,16 +1483,20 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel return nfs_ok; } -static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; memset(bc, 0, sizeof(*bc)); if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) return nfserr_bad_xdr; return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); } -static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; u32 use_conn_in_rdma_mode; __be32 status; @@ -1603,8 +1638,9 @@ nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; __be32 status; memset(exid, 0, sizeof(*exid)); @@ -1656,8 +1692,9 @@ nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; __be32 status; memset(sess, 0, sizeof(*sess)); @@ -1681,23 +1718,26 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_session *destroy_session) + union nfsd4_op_u *u) { + struct nfsd4_destroy_session *destroy_session = &u->destroy_session; return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid); } static __be32 nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, - struct nfsd4_free_stateid *free_stateid) + union nfsd4_op_u *u) { + struct nfsd4_free_stateid *free_stateid = &u->free_stateid; return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); } #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_decode_getdeviceinfo(struct 
nfsd4_compoundargs *argp, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; __be32 status; memset(gdev, 0, sizeof(*gdev)); @@ -1717,8 +1757,9 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; __be32 *p, status; memset(lcp, 0, sizeof(*lcp)); @@ -1753,8 +1794,9 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; __be32 status; memset(lgp, 0, sizeof(*lgp)); @@ -1781,8 +1823,9 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; memset(lrp, 0, sizeof(*lrp)); if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) return nfserr_bad_xdr; @@ -1795,8 +1838,9 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, #endif /* CONFIG_NFSD_PNFS */ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo_no_name *sin) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name; if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) return nfserr_bad_xdr; @@ -1806,8 +1850,9 @@ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; __be32 *p, status; status = nfsd4_decode_sessionid4(argp, &seq->sessionid); @@ -1826,8 +1871,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; __be32 status; u32 i; @@ -1852,14 +1899,16 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta } static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_clientid *dc) + union nfsd4_op_u *u) { + struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; return nfsd4_decode_clientid4(argp, &dc->clientid); } static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, - struct nfsd4_reclaim_complete *rc) + union nfsd4_op_u *u) { + struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0) return nfserr_bad_xdr; return nfs_ok; @@ -1867,8 +1916,9 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, - struct nfsd4_fallocate *fallocate) + union nfsd4_op_u *u) { + struct nfsd4_fallocate *fallocate = &u->allocate; __be32 status; status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid); @@ -1924,8 +1974,9 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp, } static __be32 -nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +nfsd4_decode_copy(struct 
nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; u32 consecutive, i, count, sync; struct nl4_server *ns_dummy; __be32 status; @@ -1982,8 +2033,9 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) static __be32 nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; __be32 status; memset(cn, 0, sizeof(*cn)); @@ -2002,16 +2054,18 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; os->count = 0; os->status = 0; return nfsd4_decode_stateid4(argp, &os->stateid); } static __be32 -nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 status; status = nfsd4_decode_stateid4(argp, &seek->seek_stateid); @@ -2028,8 +2082,9 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) } static __be32 -nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_clone *clone = &u->clone; __be32 status; status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid); @@ -2154,8 +2209,9 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) */ static __be32 nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; __be32 status; u32 maxcount; @@ -2173,8 +2229,9 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; u32 flags, maxcount, size; __be32 status; @@ -2214,8 +2271,9 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; u32 maxcount; memset(listxattrs, 0, sizeof(*listxattrs)); @@ -2245,113 +2303,114 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, - struct nfsd4_removexattr *removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; memset(removexattr, 0, sizeof(*removexattr)); return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); } static __be32 -nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfs_ok; } static __be32 -nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfserr_notsupp; } -typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u); static const nfsd4_dec nfsd4_dec_ops[] = { - [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, - [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, - [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, - [OP_CREATE] = 
(nfsd4_dec)nfsd4_decode_create, - [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, - [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, - [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, - [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, - [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, - [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, - [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, - [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, - [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, - [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, - [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, - [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_READ] = (nfsd4_dec)nfsd4_decode_read, - [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, - [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, - [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, - [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, - [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, - [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, - [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, - [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, - [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, + [OP_ACCESS] = nfsd4_decode_access, + [OP_CLOSE] = nfsd4_decode_close, + [OP_COMMIT] = nfsd4_decode_commit, + [OP_CREATE] = nfsd4_decode_create, + [OP_DELEGPURGE] = nfsd4_decode_notsupp, + [OP_DELEGRETURN] = nfsd4_decode_delegreturn, + [OP_GETATTR] = nfsd4_decode_getattr, + [OP_GETFH] = nfsd4_decode_noop, + [OP_LINK] = nfsd4_decode_link, + [OP_LOCK] = nfsd4_decode_lock, + [OP_LOCKT] = nfsd4_decode_lockt, + [OP_LOCKU] = nfsd4_decode_locku, + [OP_LOOKUP] = nfsd4_decode_lookup, + [OP_LOOKUPP] = nfsd4_decode_noop, + [OP_NVERIFY] = nfsd4_decode_verify, + [OP_OPEN] = nfsd4_decode_open, + [OP_OPENATTR] = nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade, + [OP_PUTFH] = nfsd4_decode_putfh, + [OP_PUTPUBFH] = nfsd4_decode_putpubfh, + [OP_PUTROOTFH] = nfsd4_decode_noop, + [OP_READ] = nfsd4_decode_read, + [OP_READDIR] = nfsd4_decode_readdir, + [OP_READLINK] = nfsd4_decode_noop, + [OP_REMOVE] = nfsd4_decode_remove, + [OP_RENAME] = nfsd4_decode_rename, + [OP_RENEW] = nfsd4_decode_renew, + [OP_RESTOREFH] = nfsd4_decode_noop, + [OP_SAVEFH] = nfsd4_decode_noop, + [OP_SECINFO] = nfsd4_decode_secinfo, + [OP_SETATTR] = nfsd4_decode_setattr, + [OP_SETCLIENTID] = nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = nfsd4_decode_verify, + [OP_WRITE] = nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner, /* new operations for NFSv4.1 */ - [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = 
(nfsd4_dec)nfsd4_decode_free_stateid, - [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl, + [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = nfsd4_decode_free_stateid, + [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] = nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp, + [OP_LAYOUTGET] = nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = nfsd4_decode_notsupp, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, - [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, - [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_decode_sequence, + [OP_SET_SSV] = nfsd4_decode_notsupp, + [OP_TEST_STATEID] = nfsd4_decode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid, + [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, - [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read, - [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, - [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, + [OP_ALLOCATE] = nfsd4_decode_fallocate, + [OP_COPY] = nfsd4_decode_copy, + [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify, + [OP_DEALLOCATE] = nfsd4_decode_fallocate, + [OP_IO_ADVISE] = nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status, + [OP_READ_PLUS] = nfsd4_decode_read, + [OP_SEEK] = nfsd4_decode_seek, + [OP_WRITE_SAME] = nfsd4_decode_notsupp, + [OP_CLONE] = 
nfsd4_decode_clone, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr, - [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr, - [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr, + [OP_GETXATTR] = nfsd4_decode_getxattr, + [OP_SETXATTR] = nfsd4_decode_setxattr, + [OP_LISTXATTRS] = nfsd4_decode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_decode_removexattr, }; static inline bool @@ -3630,8 +3689,10 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) } static __be32 -nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_access *access = &u->access; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3643,8 +3704,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ return 0; } -static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3660,8 +3723,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, } static __be32 -nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_close *close = &u->close; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &close->cl_stateid); @@ -3669,8 +3734,10 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 -nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_commit *commit = &u->commit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3683,8 +3750,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3697,8 +3766,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_getattr *getattr = &u->getattr; struct svc_fh *fhp = getattr->ga_fhp; struct xdr_stream *xdr = resp->xdr; @@ -3707,8 +3778,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct svc_fh **fhpp = &u->getfh; struct xdr_stream *xdr = resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; @@ -3762,8 +3835,10 @@ again: } static __be32 -nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, 
struct nfsd4_lock *lock) +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lock *lock = &u->lock; struct xdr_stream *xdr = resp->xdr; if (!nfserr) @@ -3775,8 +3850,10 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo } static __be32 -nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_lockt *lockt = &u->lockt; struct xdr_stream *xdr = resp->xdr; if (nfserr == nfserr_denied) @@ -3785,8 +3862,10 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l } static __be32 -nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_locku *locku = &u->locku; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &locku->lu_stateid); @@ -3794,8 +3873,10 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 -nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_link *link = &u->link; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3808,8 +3889,10 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 -nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open *open = &u->open; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -3902,16 +3985,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } static __be32 -nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_confirm *oc = &u->open_confirm; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); } static __be32 -nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_open_downgrade *od = &u->open_downgrade; struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &od->od_stateid); @@ -4010,8 +4097,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); unsigned long maxcount; struct xdr_stream *xdr = resp->xdr; @@ -4052,8 +4140,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readlink *readlink = &u->readlink; __be32 *p, *maxcount_p, zero = xdr_zero; struct xdr_stream *xdr = resp->xdr; int length_offset = xdr->buf->len; @@ -4097,8 +4187,10 @@ out_err: } static __be32 
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_readdir *readdir = &u->readdir; int maxcount; int bytes_left; loff_t offset; @@ -4188,8 +4280,10 @@ err_no_verf: } static __be32 -nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_remove *remove = &u->remove; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4201,8 +4295,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ } static __be32 -nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_rename *rename = &u->rename; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4284,8 +4380,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo *secinfo = &u->secinfo; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); @@ -4293,8 +4390,9 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo_no_name *secinfo) + union nfsd4_op_u *u) { + struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name; struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); @@ -4305,8 +4403,10 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, * regardless of the error status. 
*/ static __be32 -nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4329,8 +4429,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } static __be32 -nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_setclientid *scd = &u->setclientid; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4353,8 +4455,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n } static __be32 -nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_write *write = &u->write; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4370,8 +4474,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_exchange_id *exid) + union nfsd4_op_u *u) { + struct nfsd4_exchange_id *exid = &u->exchange_id; struct xdr_stream *xdr = resp->xdr; __be32 *p; char *major_id; @@ -4448,8 +4553,9 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_create_session *sess) + union nfsd4_op_u *u) { + struct nfsd4_create_session *sess = &u->create_session; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4501,8 +4607,9 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { + struct nfsd4_sequence *seq = &u->sequence; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4524,8 +4631,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_test_stateid *test_stateid) + union nfsd4_op_u *u) { + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; @@ -4545,8 +4653,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, #ifdef CONFIG_NFSD_PNFS static __be32 nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getdeviceinfo *gdev) + union nfsd4_op_u *u) { + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; @@ -4601,8 +4710,9 @@ toosmall: static __be32 nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutget *lgp) + union nfsd4_op_u *u) { + struct nfsd4_layoutget *lgp = &u->layoutget; struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; __be32 *p; @@ -4628,8 +4738,9 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutcommit *lcp) + union nfsd4_op_u *u) { + struct 
nfsd4_layoutcommit *lcp = &u->layoutcommit; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4649,8 +4760,9 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4735,8 +4847,9 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) static __be32 nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy *copy) + union nfsd4_op_u *u) { + struct nfsd4_copy *copy = &u->copy; __be32 *p; nfserr = nfsd42_encode_write_res(resp, ©->cp_res, @@ -4752,8 +4865,9 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { + struct nfsd4_offload_status *os = &u->offload_status; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4803,8 +4917,9 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_read *read = &u->read; struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; int starting_len = xdr->buf->len; @@ -4840,8 +4955,9 @@ out: static __be32 nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy_notify *cn) + union nfsd4_op_u *u) { + struct nfsd4_copy_notify *cn = &u->copy_notify; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -4875,8 +4991,9 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_seek *seek) + union nfsd4_op_u *u) { + struct nfsd4_seek *seek = &u->seek; __be32 *p; p = xdr_reserve_space(resp->xdr, 4 + 8); @@ -4887,7 +5004,8 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *p) { return nfserr; } @@ -4938,8 +5056,9 @@ nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) static __be32 nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getxattr *getxattr) + union nfsd4_op_u *u) { + struct nfsd4_getxattr *getxattr = &u->getxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p, err; @@ -4962,8 +5081,9 @@ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_setxattr *setxattr) + union nfsd4_op_u *u) { + struct nfsd4_setxattr *setxattr = &u->setxattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5003,8 +5123,9 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, static __be32 nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_listxattrs *listxattrs) + union nfsd4_op_u *u) { + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; struct xdr_stream *xdr = resp->xdr; u32 cookie_offset, count_offset, eof; u32 left, xdrleft, slen, count; @@ -5114,8 +5235,9 @@ out: static __be32 nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_removexattr 
*removexattr) + union nfsd4_op_u *u) { + struct nfsd4_removexattr *removexattr = &u->removexattr; struct xdr_stream *xdr = resp->xdr; __be32 *p; @@ -5127,7 +5249,7 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, return 0; } -typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); /* * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 @@ -5135,93 +5257,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); * done in the decoding phase. */ static const nfsd4_enc nfsd4_enc_ops[] = { - [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, - [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, - [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, - [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, - [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, - [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, - [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, - [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, - [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, - [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, - [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, - [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, - [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_READ] = (nfsd4_enc)nfsd4_encode_read, - [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, - [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, - [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, - [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, - [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, - [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, - [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, - [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ACCESS] = nfsd4_encode_access, + [OP_CLOSE] = nfsd4_encode_close, + [OP_COMMIT] = nfsd4_encode_commit, + [OP_CREATE] = nfsd4_encode_create, + [OP_DELEGPURGE] = nfsd4_encode_noop, + [OP_DELEGRETURN] = nfsd4_encode_noop, + [OP_GETATTR] = nfsd4_encode_getattr, + [OP_GETFH] = nfsd4_encode_getfh, + [OP_LINK] = nfsd4_encode_link, + [OP_LOCK] = nfsd4_encode_lock, + [OP_LOCKT] = nfsd4_encode_lockt, + [OP_LOCKU] = nfsd4_encode_locku, + [OP_LOOKUP] = nfsd4_encode_noop, + [OP_LOOKUPP] = nfsd4_encode_noop, + [OP_NVERIFY] = nfsd4_encode_noop, + [OP_OPEN] = nfsd4_encode_open, + [OP_OPENATTR] = nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade, + [OP_PUTFH] = nfsd4_encode_noop, + [OP_PUTPUBFH] = nfsd4_encode_noop, + [OP_PUTROOTFH] = nfsd4_encode_noop, + [OP_READ] = nfsd4_encode_read, + [OP_READDIR] = nfsd4_encode_readdir, + [OP_READLINK] = nfsd4_encode_readlink, + [OP_REMOVE] = nfsd4_encode_remove, + [OP_RENAME] = nfsd4_encode_rename, + [OP_RENEW] = nfsd4_encode_noop, + [OP_RESTOREFH] = nfsd4_encode_noop, + [OP_SAVEFH] = 
nfsd4_encode_noop, + [OP_SECINFO] = nfsd4_encode_secinfo, + [OP_SETATTR] = nfsd4_encode_setattr, + [OP_SETCLIENTID] = nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop, + [OP_VERIFY] = nfsd4_encode_noop, + [OP_WRITE] = nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop, /* NFSv4.1 operations */ - [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = nfsd4_encode_noop, + [OP_FREE_STATEID] = nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = nfsd4_encode_noop, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_noop, + [OP_LAYOUTGET] = nfsd4_encode_noop, + [OP_LAYOUTRETURN] = nfsd4_encode_noop, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, - [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, - [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_encode_sequence, + [OP_SET_SSV] = nfsd4_encode_noop, + [OP_TEST_STATEID] = nfsd4_encode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop, /* NFSv4.2 operations */ - [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, - [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify, - [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, - [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus, - [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, - [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, - [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ALLOCATE] = nfsd4_encode_noop, + 
[OP_COPY] = nfsd4_encode_copy, + [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify, + [OP_DEALLOCATE] = nfsd4_encode_noop, + [OP_IO_ADVISE] = nfsd4_encode_noop, + [OP_LAYOUTERROR] = nfsd4_encode_noop, + [OP_LAYOUTSTATS] = nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status, + [OP_READ_PLUS] = nfsd4_encode_read_plus, + [OP_SEEK] = nfsd4_encode_seek, + [OP_WRITE_SAME] = nfsd4_encode_noop, + [OP_CLONE] = nfsd4_encode_noop, /* RFC 8276 extended atributes operations */ - [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr, - [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr, - [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs, - [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr, + [OP_GETXATTR] = nfsd4_encode_getxattr, + [OP_SETXATTR] = nfsd4_encode_setxattr, + [OP_LISTXATTRS] = nfsd4_encode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_encode_removexattr, }; /* -- cgit From 7fd461c47c6cfab4ca4d003790ec276209e52978 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 30 Nov 2022 13:15:27 -0500 Subject: NFSv4.2: Change the default KConfig value for READ_PLUS Now that we've worked out performance issues and have a server patch addressing the failed xfstests, we can safely enable this feature by default. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 14a72224b657..1ead5bd740c2 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default n + default y help - This is intended for developers only. The READ_PLUS operation has - been shown to have issues under specific conditions and should not - be used in production. + Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS + attempts to improve read performance by compressing out sparse holes + in the file contents. -- cgit From e0c49bd2b4d3cd1751491eb2d940bce968ac65e9 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Fri, 9 Dec 2022 18:04:48 +0800 Subject: fs: sysv: Fix sysv_nblocks() returns wrong value sysv_nblocks() returns 'blocks' rather than 'res', which only counting the number of triple-indirect blocks and causing sysv_getattr() gets a wrong result. [AV: this is actually a sysv counterpart of minixfs fix - 0fcd426de9d0 "[PATCH] minix block usage counting fix" in historical tree; mea culpa, should've thought to check fs/sysv back then...] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Chen Zhongjin Signed-off-by: Al Viro --- fs/sysv/itree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index d4ec9bb97de9..3b8567564e7e 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -438,7 +438,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) res += blocks; direct = 1; } - return blocks; + return res; } int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, -- cgit From 37ba7b005a7a4454046bd8659c7a9c5330552396 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 29 Oct 2022 00:01:38 +0900 Subject: ksmbd: set SMB2_SESSION_FLAG_ENCRYPT_DATA when enforcing data encryption for this share Currently, SMB2_SESSION_FLAG_ENCRYPT_DATA is always set session setup response. 
Since this forces data encryption from the client, there is a problem that data is always encrypted regardless of the use of the cifs seal mount option. SMB2_SESSION_FLAG_ENCRYPT_DATA should be set according to KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION flags, and in case of KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF, encryption mode is turned off for all connections. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/ksmbd_netlink.h | 1 + fs/ksmbd/smb2ops.c | 10 ++++++++-- fs/ksmbd/smb2pdu.c | 8 +++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index ff07c67f4565..b6bd8311e6b4 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -74,6 +74,7 @@ struct ksmbd_heartbeat { #define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0) #define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1) #define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2) +#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3) /* * IPC request for ksmbd server startup diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index ab23da2120b9..e401302478c3 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -247,8 +247,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION && - conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) @@ -271,6 +272,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn) if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d0..56d68ddc409c 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -903,7 +903,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, return; } - if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)) + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) return; for (i = 0; i < cph_cnt; i++) { @@ -1508,7 +1508,8 @@ static int ntlm_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; /* * signing is disable if encryption is enable * on this session @@ -1599,7 +1600,8 @@ static int krb5_authenticate(struct ksmbd_work *work) return -EINVAL; } sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION) + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; sess->sign = false; } -- cgit From 7ecbe92696bb7fe32c80b6cf64736a0d157717a9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Nov 2022 08:11:53 -0500 Subject: ksmbd: use F_SETLK when unlocking a file ksmbd seems to 
be trying to use a cmd value of 0 when unlocking a file. That activity requires a type of F_UNLCK with a cmd of F_SETLK. For local POSIX locking, it doesn't matter much since vfs_lock_file ignores @cmd, but filesystems that define their own ->lock operation expect to see it set sanely. Cc: David Howells Signed-off-by: Jeff Layton Reviewed-by: David Howells Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 56d68ddc409c..de8e367095c9 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6753,7 +6753,7 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags) case SMB2_LOCKFLAG_UNLOCK: ksmbd_debug(SMB, "received unlock request\n"); flock->fl_type = F_UNLCK; - cmd = 0; + cmd = F_SETLK; break; } @@ -7131,7 +7131,7 @@ out: rlock->fl_start = smb_lock->start; rlock->fl_end = smb_lock->end; - rc = vfs_lock_file(filp, 0, rlock, NULL); + rc = vfs_lock_file(filp, F_SETLK, rlock, NULL); if (rc) pr_err("rollback unlock fail : %d\n", rc); -- cgit From 30429388531b120902126a39cf64f877fc5c7773 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 15 Nov 2022 09:35:10 -0600 Subject: ksmbd: replace one-element arrays with flexible-array members One-element arrays are deprecated, and we are replacing them with flexible array members instead. So, replace one-element arrays with flexible-array members in multiple structs in fs/ksmbd/smb_common.h and one in fs/ksmbd/smb2pdu.h. Important to mention is that doing a build before/after this patch results in no binary output differences. This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/242 Link: https://github.com/KSPP/linux/issues/79 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Kees Cook Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++-- fs/ksmbd/smb2pdu.h | 2 +- fs/ksmbd/smb_common.h | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index de8e367095c9..79261493a212 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3440,7 +3440,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, goto free_conv_name; } - struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len; + struct_sz = readdir_info_level_struct_sz(info_level) + conv_len; next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT); d_info->last_entry_off_align = next_entry_offset - struct_sz; @@ -3692,7 +3692,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return -EOPNOTSUPP; conv_len = (d_info->name_len + 1) * 2; - next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + next_entry_offset = ALIGN(struct_sz + conv_len, KSMBD_DIR_INFO_ALIGNMENT); if (next_entry_offset > d_info->out_buf_len) { diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 092fdd3f8750..aa5dbe54f5a1 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -443,7 +443,7 @@ struct smb2_posix_info { /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ u8 SidBuffer[32]; __le32 name_len; - u8 name[1]; + u8 name[]; /* * var sized owner SID * var sized group SID diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 318c16fa81da..e663ab9ea759 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -277,14 +277,14 @@ struct file_directory_info { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0x101 FF resp data */ struct file_names_info { __le32 NextEntryOffset; __u32 FileIndex; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __packed; /* level 0xc FF resp data */ struct file_full_directory_info { @@ -299,7 +299,7 @@ struct file_full_directory_info { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; - char FileName[1]; + char FileName[]; } __packed; /* level 0x102 FF resp */ struct file_both_directory_info { @@ -317,7 +317,7 @@ struct file_both_directory_info { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __packed; /* level 0x104 FFrsp data */ struct file_id_both_directory_info { @@ -337,7 +337,7 @@ struct file_id_both_directory_info { __u8 ShortName[24]; __le16 Reserved2; __le64 UniqueId; - char FileName[1]; + char FileName[]; } __packed; struct file_id_full_dir_info { @@ -354,7 +354,7 @@ struct file_id_full_dir_info { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __packed; /* level 0x105 FF rsp data */ struct smb_version_values { -- cgit From bc044414fa0326a4e5c3c509c00b1fcaf621b5f4 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 16 Nov 2022 20:22:37 +0800 Subject: ksmbd: Fix resource leak in ksmbd_session_rpc_open() When ksmbd_rpc_open() fails then it must call ksmbd_rpc_id_free() to undo the result of ksmbd_ipc_id_alloc(). 
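The fix follows the kernel's ordered-unwind idiom: each failure point jumps to a label that releases exactly what had been acquired before it, in reverse order. A minimal userspace sketch of the shape (the helper names are illustrative stand-ins, not the ksmbd APIs):

    #include <stdlib.h>

    struct entry { int id; };

    /* trivial stand-ins so the sketch compiles; not the ksmbd helpers */
    static int next_id;
    static int id_alloc(void) { return next_id++; }
    static void id_free(int id) { (void)id; }
    static void *rpc_open(int id) { return id % 2 ? NULL : malloc(1); }

    static struct entry *open_entry(void)
    {
        struct entry *entry;
        void *resp;

        entry = malloc(sizeof(*entry));
        if (!entry)
            return NULL;

        entry->id = id_alloc();
        if (entry->id < 0)
            goto free_entry;        /* only the entry exists so far */

        resp = rpc_open(entry->id);
        if (!resp)
            goto free_id;           /* the id must be released as well */

        free(resp);
        return entry;

    free_id:
        id_free(entry->id);
    free_entry:
        free(entry);
        return NULL;
    }

    int main(void)
    {
        struct entry *e = open_entry();

        free(e);    /* free(NULL) is a no-op */
        return 0;
    }

The point of the label ordering is that a failure after the id allocation must undo both steps, while a failure before it must undo only the first; the original single "error:" label could not distinguish the two.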
Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Signed-off-by: Xiu Jianfeng Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/mgmt/user_session.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 3fa2139a0b30..92b1603b5abe 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -108,15 +108,17 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry->method = method; entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) - goto error; + goto free_entry; resp = ksmbd_rpc_open(sess, entry->id); if (!resp) - goto error; + goto free_id; kvfree(resp); return entry->id; -error: +free_id: + ksmbd_rpc_id_free(entry->id); +free_entry: list_del(&entry->list); kfree(entry); return -EINVAL; -- cgit From 01f6c61bae3d658058ee6322af77acea26a5ee3a Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Tue, 29 Nov 2022 12:19:33 +0100 Subject: ksmbd: Fix resource leak in smb2_lock() "flock" is leaked if an error happens before smb2_lock_init(), as the lock is not added to the lock_list to be cleaned up. Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 79261493a212..88d2c23369fe 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6857,6 +6857,7 @@ int smb2_lock(struct ksmbd_work *work) if (lock_start > U64_MAX - lock_length) { pr_err("Invalid lock range requested\n"); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6876,6 +6877,7 @@ int smb2_lock(struct ksmbd_work *work) "the end offset(%llx) is smaller than the start offset(%llx)\n", flock->fl_end, flock->fl_start); rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + locks_free_lock(flock); goto out; } @@ -6887,6 +6889,7 @@ int smb2_lock(struct ksmbd_work *work) flock->fl_type != F_UNLCK) { pr_err("conflict two locks in one request\n"); err = -EINVAL; + locks_free_lock(flock); goto out; } } @@ -6895,6 +6898,7 @@ int smb2_lock(struct ksmbd_work *work) smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list); if (!smb_lock) { err = -EINVAL; + locks_free_lock(flock); goto out; } } -- cgit From 72ee45fd46d0d3578c4e6046f66fae3218543ce3 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Wed, 7 Dec 2022 09:29:27 +0800 Subject: ksmbd: Convert to use sysfs_emit()/sysfs_emit_at() APIs Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. 
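For reference, sysfs_emit() and sysfs_emit_at() wrap scnprintf() with the built-in knowledge that a sysfs show() buffer is exactly PAGE_SIZE, so callers no longer have to thread "PAGE_SIZE - sz" bounds by hand. A hedged kernel-style sketch of the pattern (the attribute and flag names are invented for illustration, and registration of the attribute is omitted):

    #include <linux/kernel.h>
    #include <linux/kobject.h>
    #include <linux/sysfs.h>

    static const char * const flag_names[] = { "smb", "auth", "vfs" };
    static unsigned long flag_bits;

    static ssize_t flags_show(struct kobject *kobj, struct kobj_attribute *attr,
                              char *buf)
    {
        ssize_t sz = 0;
        int i;

        /* enabled flags are bracketed, disabled ones printed bare */
        for (i = 0; i < ARRAY_SIZE(flag_names); i++)
            sz += sysfs_emit_at(buf, sz, (flag_bits >> i) & 1 ?
                                "[%s] " : "%s ", flag_names[i]);
        sz += sysfs_emit_at(buf, sz, "\n");
        return sz;
    }
    static struct kobj_attribute flags_attr = __ATTR_RO(flags);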
Signed-off-by: ye xingchen Reviewed-by: Sergey Senozhatsky Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/server.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index a0d635304754..394b6ceac431 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -432,11 +432,9 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr, "reset", "shutdown" }; - - ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version, - state[server_conf.state], server_conf.tcp_port, - server_conf.ipc_last_active / HZ); - return sz; + return sysfs_emit(buf, "%d %s %d %lu\n", stats_version, + state[server_conf.state], server_conf.tcp_port, + server_conf.ipc_last_active / HZ); } static ssize_t kill_server_store(struct class *class, @@ -468,19 +466,13 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr, for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) { if ((ksmbd_debug_types >> i) & 1) { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "[%s] ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "[%s] ", debug_type_strings[i]); } else { - pos = scnprintf(buf + sz, - PAGE_SIZE - sz, - "%s ", - debug_type_strings[i]); + pos = sysfs_emit_at(buf, sz, "%s ", debug_type_strings[i]); } sz += pos; } - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + sz += sysfs_emit_at(buf, sz, "\n"); return sz; } -- cgit From ff39899be80b9d90d5e13775eb9fd150338b6e15 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 21 Jul 2022 09:59:32 +0800 Subject: exfat: simplify empty entry hint This commit adds exfat_set_empty_hint()/exfat_reset_empty_hint() to reduce code complexity and make code more readable. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 58 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 0fc08fdcba73..c0e60ff9ec7d 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -897,6 +897,29 @@ free_es: return NULL; } +static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) +{ + hint_femp->eidx = EXFAT_HINT_NONE; + hint_femp->count = 0; +} + +static inline void exfat_set_empty_hint(struct exfat_inode_info *ei, + struct exfat_hint_femp *candi_empty, struct exfat_chain *clu, + int dentry, int num_entries) +{ + if (ei->hint_femp.eidx == EXFAT_HINT_NONE || + ei->hint_femp.eidx > dentry) { + if (candi_empty->count == 0) { + candi_empty->cur = *clu; + candi_empty->eidx = dentry; + } + + candi_empty->count++; + if (candi_empty->count == num_entries) + ei->hint_femp = *candi_empty; + } +} + enum { DIRENT_STEP_FILE, DIRENT_STEP_STRM, @@ -921,7 +944,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; - int dentries_per_clu, num_empty = 0; + int dentries_per_clu; unsigned int entry_type; unsigned short *uniname = NULL; struct exfat_chain clu; @@ -939,10 +962,13 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, end_eidx = dentry; } - candi_empty.eidx = EXFAT_HINT_NONE; + exfat_reset_empty_hint(&ei->hint_femp); + rewind: order = 0; step = DIRENT_STEP_FILE; + exfat_reset_empty_hint(&candi_empty); + while (clu.dir != EXFAT_EOF_CLUSTER) { i = dentry & (dentries_per_clu - 
1); for (; i < dentries_per_clu; i++, dentry++) { @@ -962,26 +988,8 @@ rewind: entry_type == TYPE_DELETED) { step = DIRENT_STEP_FILE; - num_empty++; - if (candi_empty.eidx == EXFAT_HINT_NONE && num_empty == 1) { - exfat_chain_set(&candi_empty.cur, clu.dir, clu.size, clu.flags); - } - - if (candi_empty.eidx == EXFAT_HINT_NONE && num_empty >= num_entries) { - candi_empty.eidx = dentry - (num_empty - 1); - WARN_ON(candi_empty.eidx < 0); - candi_empty.count = num_empty; - - if (ei->hint_femp.eidx == EXFAT_HINT_NONE || candi_empty.eidx <= ei->hint_femp.eidx) - ei->hint_femp = candi_empty; - } + exfat_set_empty_hint(ei, &candi_empty, &clu, dentry, num_entries); brelse(bh); if (entry_type == TYPE_UNUSED) @@ -989,8 +997,7 @@ rewind: continue; } - num_empty = 0; - candi_empty.eidx = EXFAT_HINT_NONE; if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { step = DIRENT_STEP_FILE; @@ -1090,9 +1097,6 @@ not_found: rewind = 1; dentry = 0; clu.dir = p_dir->dir; - /* reset empty hint */ - num_empty = 0; - candi_empty.eidx = EXFAT_HINT_NONE; goto rewind; } -- cgit
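The empty-entry hint that the commit above factors into exfat_set_empty_hint() boils down to tracking a candidate run of free directory slots during a scan and recording the run once it is long enough. A toy userspace sketch of that shape (illustrative names and a toy slot map, not the exfat code):

    #include <stdio.h>

    /* toy directory: '.' = empty slot, 'X' = in-use slot */
    static int find_free_run(const char *slots, int n, int need)
    {
        int run = 0;

        for (int i = 0; i < n; i++) {
            if (slots[i] != '.') {
                run = 0;                /* candidate run broken */
                continue;
            }
            if (++run == need)
                return i - need + 1;    /* first slot of the run */
        }
        return -1;  /* no hint: the caller must extend the directory */
    }

    int main(void)
    {
        /* needs 3 contiguous empty slots; the run starts at index 4 */
        printf("%d\n", find_free_run("XX.X...XX", 9, 3));
        return 0;
    }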
From e298c8a818a3e517582e60c412f4a41b3a1647c5 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 7 Nov 2022 17:22:13 +0900 Subject: exfat: hint the empty entry which at the end of cluster chain After traversing all directory entries, hint the empty directory entries whether or not enough of them were found. After this commit, the empty directory entries are hinted like this:
1. Hint the deleted directory entries if there are enough of them;
2. Hint the deleted and unused directory entries at the end of the cluster chain, whether there are enough of them or not (added by this commit);
3. If there are no empty directory entries at all, hint the empty directory entries in the new cluster (added by this commit).
This avoids repeated traversal of directory entries, reduces CPU usage, and improves the performance of creating files and directories (especially on low-performance CPUs). Test: create 5000 files on a class 4 SD card on imx6q-sabrelite with:

for ((i=0;i<5;i++)); do
    sync
    time (for ((j=1;j<=1000;j++)); do touch file$((i*1000+j)); done)
done

The more files, the bigger the performance improvement.

           Before     After      Improvement
1~1000     25.360s    22.168s    14.40%
1001~2000  38.242s    28.72s     33.15%
2001~3000  49.134s    35.037s    40.23%
3001~4000  62.042s    41.624s    49.05%
4001~5000  73.629s    46.772s    57.42%

Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 26 ++++++++++++++++++++++---- fs/exfat/namei.c | 33 +++++++++++++++++++++------------ 2 files changed, 43 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index c0e60ff9ec7d..30d0ac43b66c 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -905,17 +905,24 @@ static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) static inline void exfat_set_empty_hint(struct exfat_inode_info *ei, struct exfat_hint_femp *candi_empty, struct exfat_chain *clu, - int dentry, int num_entries) + int dentry, int num_entries, int entry_type) { if (ei->hint_femp.eidx == EXFAT_HINT_NONE || ei->hint_femp.eidx > dentry) { + int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode)); + if (candi_empty->count == 0) { candi_empty->cur = *clu; candi_empty->eidx = dentry; } + if (entry_type == TYPE_UNUSED) + candi_empty->count += total_entries - dentry; + else - candi_empty->count++; - if (candi_empty->count == num_entries) + candi_empty->count++; + + if (candi_empty->count == num_entries || + candi_empty->count + candi_empty->eidx == total_entries) ei->hint_femp = *candi_empty; } } @@ -989,7 +996,8 @@ rewind: step = DIRENT_STEP_FILE; exfat_set_empty_hint(ei, &candi_empty, &clu, - dentry, num_entries); + dentry, num_entries, + entry_type); brelse(bh); if (entry_type == TYPE_UNUSED) @@ -1100,6 +1108,16 @@ not_found: goto rewind; } + /* + * set the EXFAT_EOF_CLUSTER flag to avoid search + * from the beginning again when allocated a new cluster + */ + if (ei->hint_femp.eidx == EXFAT_HINT_NONE) { + ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER; + ei->hint_femp.eidx = p_dir->size * dentries_per_clu; + ei->hint_femp.count = 0; + } + /* initialized hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index b617bebc3d0f..99e00a36c029 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -224,11 +224,18 @@ static int exfat_search_empty_slot(struct super_block *sb, if (hint_femp->eidx != EXFAT_HINT_NONE) { dentry = hint_femp->eidx; - if (num_entries <= hint_femp->count) { - hint_femp->eidx = EXFAT_HINT_NONE; - return dentry; - } + /* + * If hint_femp->count is enough, it is needed to check if + * there are actual empty entries. + * Otherwise, and if "dentry + hint_femp->count" is also equal + * to "p_dir->size * dentries_per_clu", it means ENOSPC.
+ */ + if (dentry + hint_femp->count == p_dir->size * dentries_per_clu && + num_entries > hint_femp->count) + return -ENOSPC; + + hint_femp->eidx = EXFAT_HINT_NONE; exfat_chain_dup(&clu, &hint_femp->cur); } else { exfat_chain_dup(&clu, p_dir); @@ -293,6 +300,12 @@ static int exfat_search_empty_slot(struct super_block *sb, } } + hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty; + hint_femp->count = num_empty; + if (num_empty == 0) + exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0, + clu.flags); + return -ENOSPC; } @@ -369,15 +382,11 @@ static int exfat_find_empty_entry(struct inode *inode, if (exfat_ent_set(sb, last_clu, clu.dir)) return -EIO; - if (hint_femp.eidx == EXFAT_HINT_NONE) { - /* the special case that new dentry - * should be allocated from the start of new cluster - */ - hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi); - hint_femp.count = sbi->dentries_per_clu; - + if (hint_femp.cur.dir == EXFAT_EOF_CLUSTER) exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags); - } + + hint_femp.count += sbi->dentries_per_clu; + hint_femp.cur.size++; p_dir->size++; size = EXFAT_CLU_TO_B(p_dir->size, sbi); -- cgit From f83d8a3b532097276266b5e81073ea46e27b17ab Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 7 Apr 2022 15:55:56 +0800 Subject: exfat: reduce the size of exfat_entry_set_cache Normally, there are at most 19 directory entries for a file or a directory:
- a file directory entry
- a stream extension directory entry
- 1~17 file name directory entries
So the directory entries are in 3 sectors at most, and it is enough for struct exfat_entry_set_cache to pre-allocate 3 bh. This commit changes the size of struct exfat_entry_set_cache as follows:

               Before     After
32-bit system  88 bytes   32 bytes
64-bit system  168 bytes  48 bytes

Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index a8f8eee4937c..af55018ff22e 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -9,6 +9,7 @@ #include #include #include +#include #define EXFAT_ROOT_INO 1 @@ -41,6 +42,14 @@ enum { #define ES_2_ENTRIES 2 #define ES_ALL_ENTRIES 0 +#define ES_IDX_FILE 0 +#define ES_IDX_STREAM 1 +#define ES_IDX_FIRST_FILENAME 2 +#define EXFAT_FILENAME_ENTRY_NUM(name_len) \ + DIV_ROUND_UP(name_len, EXFAT_FILE_NAME_LEN) +#define ES_IDX_LAST_FILENAME(name_len) \ + (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1) + #define DIR_DELETED 0xFFFF0321 /* type values */ @@ -68,9 +77,6 @@ enum { #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ #define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE) -/* Enough size to hold 256 dentry (even 512 Byte sector) */ -#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1) - #define EXFAT_HINT_NONE -1 #define EXFAT_MIN_SUBDIR 2 @@ -125,6 +131,17 @@ enum { #define BITS_PER_BYTE_MASK 0x7 #define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1) +#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1) +/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */ +#define ES_MAX_ENTRY_NUM ES_ENTRY_NUM(MAX_NAME_LENGTH) + +/* + * 19 entries x 32 bytes/entry = 608 bytes. + * The 608 bytes are in 3 sectors at most (even 512 Byte sector). + */ +#define DIR_CACHE_SIZE \ + (DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1) + struct exfat_dentry_namebuf { char *lfn; int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */ @@ -166,11 +183,11 @@ struct exfat_hint { struct exfat_entry_set_cache { struct super_block *sb; - bool modified; unsigned int start_off; int num_bh; struct buffer_head *bh[DIR_CACHE_SIZE]; unsigned int num_entries; + bool modified; }; struct exfat_dir_entry { -- cgit
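As a sanity check on the arithmetic behind ES_MAX_ENTRY_NUM and DIR_CACHE_SIZE above, a small standalone sketch; the constants are restated here for illustration, assuming EXFAT_FILE_NAME_LEN is 15 UTF-16 characters per file name entry and DENTRY_SIZE is 32 bytes:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)   (((n) + (d) - 1) / (d))

    #define EXFAT_FILE_NAME_LEN  15   /* UTF-16 chars per file name entry */
    #define MAX_NAME_LENGTH      255
    #define DENTRY_SIZE          32   /* bytes per directory entry */
    #define SECTOR_SIZE          512  /* worst case: smallest sector size */

    int main(void)
    {
        int name_entries = DIV_ROUND_UP(MAX_NAME_LENGTH, EXFAT_FILE_NAME_LEN);
        int max_entries = 2 + name_entries;   /* file + stream + name entries */
        int bytes = max_entries * DENTRY_SIZE;
        int bh = DIV_ROUND_UP(bytes, SECTOR_SIZE) + 1;  /* +1: set may straddle */

        /* prints: 17 name entries, 19 total, 608 bytes, 3 bh */
        printf("%d name entries, %d total, %d bytes, %d bh\n",
               name_entries, max_entries, bytes, bh);
        return 0;
    }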
From a3ff29a95fde16906304455aa8c0bd84eb770258 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Wed, 9 Nov 2022 13:50:22 +0800 Subject: exfat: support dynamic allocate bh for exfat_entry_set_cache In special cases, a file or a directory may occupy more than 19 directory entries, so pre-allocating 3 bh is not enough. For example:
- a vendor secondary directory entry may be supported in the future;
- if the file directory entry is damaged, the SecondaryCount field may be bigger than 18.
So this commit supports dynamic allocation of bh. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 15 +++++++++++++++ fs/exfat/exfat_fs.h | 5 ++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 30d0ac43b66c..03e9c9e3966e 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -615,6 +615,10 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) bforget(es->bh[i]); else brelse(es->bh[i]); + + if (IS_DYNAMIC_ES(es)) + kfree(es->bh); + kfree(es); return err; } @@ -847,6 +851,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, /* byte offset in sector */ off = EXFAT_BLK_OFFSET(byte_offset, sb); es->start_off = off; + es->bh = es->__bh; /* sector offset in cluster */ sec = EXFAT_B_TO_BLK(byte_offset, sb); @@ -866,6 +871,16 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, es->num_entries = num_entries; num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb); + if (num_bh > ARRAY_SIZE(es->__bh)) { + es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL); + if (!es->bh) { + brelse(bh); + kfree(es); + return NULL; + } + es->bh[0] = bh; + } + for (i = 1; i < num_bh; i++) { /* get the next sector */ if (exfat_is_last_sector_in_cluster(sbi, sec)) { diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index af55018ff22e..82395ae80dba 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -185,11 +185,14 @@ struct exfat_entry_set_cache { struct super_block *sb; unsigned int start_off; int num_bh; - struct buffer_head *bh[DIR_CACHE_SIZE]; + struct buffer_head *__bh[DIR_CACHE_SIZE]; + struct buffer_head **bh; unsigned int num_entries; bool modified; }; +#define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh) + struct exfat_dir_entry { struct exfat_chain dir; int entry; -- cgit From 20914ff6dd56dd6b548bf5dd90bff09ef89999e4 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Nov 2022 11:37:13 +0800 Subject: exfat: move exfat_entry_set_cache from heap to stack The size of struct exfat_entry_set_cache is only 56 bytes on a 64-bit system, and allocating it on the stack is more efficient than allocating it from the heap.
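The conversion below swaps a heap-allocated, pointer-returning API for one that fills caller-provided storage. A hedged sketch of the general shape (illustrative names, not the exact exfat signatures):

    #include <string.h>

    struct cache { int num; void *slots[3]; };

    /* after the conversion: the caller owns the storage */
    static int cache_get(struct cache *c)
    {
        memset(c, 0, sizeof(*c));
        /* ... fill c->slots from disk ... */
        return 0;
    }

    static void caller(void)
    {
        struct cache c;    /* small struct: cheap to keep on the stack */

        if (cache_get(&c))
            return;
        /* ... use c; no kfree()/free() needed on any path ... */
    }

    int main(void)
    {
        caller();
        return 0;
    }

Beyond skipping an allocation per call, this shape removes an error path entirely: there is no out-of-memory case for the object itself, only for whatever it references.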
Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 35 +++++++++++++++-------------------- fs/exfat/exfat_fs.h | 5 +++-- fs/exfat/inode.c | 13 ++++++------- fs/exfat/namei.c | 11 +++++------ 4 files changed, 29 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 03e9c9e3966e..a3fb609dd129 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -33,10 +33,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { int i; - struct exfat_entry_set_cache *es; + struct exfat_entry_set_cache es; - es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES)) return; /* @@ -45,8 +44,8 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. */ - for (i = 2; i < es->num_entries; i++) { - struct exfat_dentry *ep = exfat_get_dentry_cached(es, i); + for (i = 2; i < es.num_entries; i++) { + struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i); /* end of name entry */ if (exfat_get_entry_type(ep) != TYPE_EXTEND) @@ -56,7 +55,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, uniname += EXFAT_FILE_NAME_LEN; } - exfat_free_dentry_set(es, false); + exfat_free_dentry_set(&es, false); } /* read a directory entry from the opened directory */ @@ -619,7 +618,6 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) if (IS_DYNAMIC_ES(es)) kfree(es->bh); - kfree(es); return err; } @@ -816,14 +814,14 @@ struct exfat_dentry *exfat_get_dentry_cached( * pointer of entry set on success, * NULL on failure. 
*/ -struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type) +int exfat_get_dentry_set(struct exfat_entry_set_cache *es, + struct super_block *sb, struct exfat_chain *p_dir, int entry, + unsigned int type) { int ret, i, num_bh; unsigned int off, byte_offset, clu = 0; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct exfat_entry_set_cache *es; struct exfat_dentry *ep; int num_entries; enum exfat_validate_dentry_mode mode = ES_MODE_STARTED; @@ -831,17 +829,15 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, if (p_dir->dir == DIR_DELETED) { exfat_err(sb, "access to deleted dentry"); - return NULL; + return -EIO; } byte_offset = EXFAT_DEN_TO_B(entry); ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu); if (ret) - return NULL; + return ret; - es = kzalloc(sizeof(*es), GFP_KERNEL); - if (!es) - return NULL; + memset(es, 0, sizeof(*es)); es->sb = sb; es->modified = false; @@ -859,7 +855,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, bh = sb_bread(sb, sec); if (!bh) - goto free_es; + return -EIO; es->bh[es->num_bh++] = bh; ep = exfat_get_dentry_cached(es, 0); @@ -875,8 +871,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL); if (!es->bh) { brelse(bh); - kfree(es); - return NULL; + return -ENOMEM; } es->bh[0] = bh; } @@ -905,11 +900,11 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto free_es; } - return es; + return 0; free_es: exfat_free_dentry_set(es, false); - return NULL; + return -EIO; } static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 82395ae80dba..2ffe5792b1a9 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -490,8 +490,9 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh); struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int num); -struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type); +int exfat_get_dentry_set(struct exfat_entry_set_cache *es, + struct super_block *sb, struct exfat_chain *p_dir, int entry, + unsigned int type); int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5590a1e83126..cdcf037a304f 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -21,7 +21,7 @@ int __exfat_write_inode(struct inode *inode, int sync) { unsigned long long on_disk_size; struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es = NULL; + struct exfat_entry_set_cache es; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); @@ -42,11 +42,10 @@ int __exfat_write_inode(struct inode *inode, int sync) exfat_set_volume_dirty(sb); /* get the directory entry of given file or directory */ - es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); + ep = 
exfat_get_dentry_cached(&es, 0); + ep2 = exfat_get_dentry_cached(&es, 1); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); @@ -83,8 +82,8 @@ int __exfat_write_inode(struct inode *inode, int sync) ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; } - exfat_update_dir_chksum_with_entry_set(es); - return exfat_free_dentry_set(es, sync); + exfat_update_dir_chksum_with_entry_set(&es); + return exfat_free_dentry_set(&es, sync); } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 99e00a36c029..8d72527dfb78 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -604,7 +604,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(dir); struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es; + struct exfat_entry_set_cache es; /* for optimized dir & entry to prevent long traverse of cluster chain */ struct exfat_hint hint_opt; @@ -644,11 +644,10 @@ static int exfat_find(struct inode *dir, struct qstr *qname, if (cdir.flags & ALLOC_NO_FAT_CHAIN) cdir.size -= dentry / sbi->dentries_per_clu; dentry = hint_opt.eidx; - es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES); - if (!es) + if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); + ep = exfat_get_dentry_cached(&es, 0); + ep2 = exfat_get_dentry_cached(&es, 1); info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); @@ -677,7 +676,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, ep->dentry.file.access_time, ep->dentry.file.access_date, 0); - exfat_free_dentry_set(es, false); + exfat_free_dentry_set(&es, false); if (ei->start_clu == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, -- cgit From 3b9681acb0ef739343d8cfd35e054aab9597f1dc Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Mar 2022 18:12:40 +0800 Subject: exfat: rename exfat_free_dentry_set() to exfat_put_dentry_set() Since struct exfat_entry_set_cache is allocated from stack, no need to free, so rename exfat_free_dentry_set() to exfat_put_dentry_set(). After renaming, the new function pair is exfat_get_dentry_set()/exfat_put_dentry_set(). Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 16 ++++++++-------- fs/exfat/exfat_fs.h | 2 +- fs/exfat/inode.c | 2 +- fs/exfat/namei.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index a3fb609dd129..a9a0b3e46af2 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -55,7 +55,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, uniname += EXFAT_FILE_NAME_LEN; } - exfat_free_dentry_set(&es, false); + exfat_put_dentry_set(&es, false); } /* read a directory entry from the opened directory */ @@ -602,7 +602,7 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es) es->modified = true; } -int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) +int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync) { int i, err = 0; @@ -860,7 +860,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, ep = exfat_get_dentry_cached(es, 0); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) - goto free_es; + goto put_es; num_entries = type == ES_ALL_ENTRIES ? 
ep->dentry.file.num_ext + 1 : type; @@ -882,7 +882,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, if (p_dir->flags == ALLOC_NO_FAT_CHAIN) clu++; else if (exfat_get_next_cluster(sb, &clu)) - goto free_es; + goto put_es; sec = exfat_cluster_to_sector(sbi, clu); } else { sec++; @@ -890,7 +890,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, bh = sb_bread(sb, sec); if (!bh) - goto free_es; + goto put_es; es->bh[es->num_bh++] = bh; } @@ -898,12 +898,12 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, for (i = 1; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) - goto free_es; + goto put_es; } return 0; -free_es: - exfat_free_dentry_set(es, false); +put_es: + exfat_put_dentry_set(es, false); return -EIO; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 2ffe5792b1a9..324acc57d029 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -493,7 +493,7 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int type); -int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync); +int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); /* inode.c */ diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index cdcf037a304f..a84eae72556d 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -83,7 +83,7 @@ int __exfat_write_inode(struct inode *inode, int sync) } exfat_update_dir_chksum_with_entry_set(&es); - return exfat_free_dentry_set(&es, sync); + return exfat_put_dentry_set(&es, sync); } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 8d72527dfb78..57510d7f58cf 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -676,7 +676,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, ep->dentry.file.access_time, ep->dentry.file.access_date, 0); - exfat_free_dentry_set(&es, false); + exfat_put_dentry_set(&es, false); if (ei->start_clu == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, -- cgit From f3fe3954c09f97d8227d9d2edc807796a8b228ab Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Mar 2022 19:39:20 +0800 Subject: exfat: replace magic numbers with Macros Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 12 ++++++------ fs/exfat/inode.c | 4 ++-- fs/exfat/namei.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index a9a0b3e46af2..c05493fc9124 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -44,7 +44,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. 
*/ - for (i = 2; i < es.num_entries; i++) { + for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) { struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i); /* end of name entry */ @@ -336,7 +336,7 @@ int exfat_calc_num_entries(struct exfat_uni_name *p_uniname) return -EINVAL; /* 1 file entry + 1 stream entry + name entries */ - return ((len - 1) / EXFAT_FILE_NAME_LEN + 3); + return ES_ENTRY_NUM(len); } unsigned int exfat_get_entry_type(struct exfat_dentry *ep) @@ -591,13 +591,13 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es) unsigned short chksum = 0; struct exfat_dentry *ep; - for (i = 0; i < es->num_entries; i++) { + for (i = ES_IDX_FILE; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum, chksum_type); chksum_type = CS_DEFAULT; } - ep = exfat_get_dentry_cached(es, 0); + ep = exfat_get_dentry_cached(es, ES_IDX_FILE); ep->dentry.file.checksum = cpu_to_le16(chksum); es->modified = true; } @@ -858,7 +858,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, return -EIO; es->bh[es->num_bh++] = bh; - ep = exfat_get_dentry_cached(es, 0); + ep = exfat_get_dentry_cached(es, ES_IDX_FILE); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto put_es; @@ -895,7 +895,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, } /* validate cached dentries */ - for (i = 1; i < num_entries; i++) { + for (i = ES_IDX_STREAM; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto put_es; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a84eae72556d..dac5001bae9e 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -44,8 +44,8 @@ int __exfat_write_inode(struct inode *inode, int sync) /* get the directory entry of given file or directory */ if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(&es, 0); - ep2 = exfat_get_dentry_cached(&es, 1); + ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); + ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 57510d7f58cf..01e4e8c60bbe 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -646,8 +646,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname, dentry = hint_opt.eidx; if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES)) return -EIO; - ep = exfat_get_dentry_cached(&es, 0); - ep2 = exfat_get_dentry_cached(&es, 1); + ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); + ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); -- cgit From 088f1343d9108c16fca064951d85e6de9f5cab42 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Tue, 16 Aug 2022 16:55:06 +0800 Subject: exfat: remove call ilog2() from exfat_readdir() There is no need to call ilog2() for the conversions between cluster and dentry in exfat_readdir(), because these conversions can be replaced with EXFAT_DEN_TO_CLU()/EXFAT_CLU_TO_DEN(). Code refinement, no functional changes. 
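As a sanity check on this conversion: the shift-based helpers agree with plain division whenever the per-cluster dentry count is a power of two, which exFAT guarantees because both the cluster size and DENTRY_SIZE (32 bytes) are powers of two. A small user-space sketch with assumed values (4 KiB clusters); the helper names mirror, but are not, the kernel macros:

#include <assert.h>

#define DENTRY_SIZE_BITS 5	/* 32-byte directory entries */

static unsigned int clu_to_den(unsigned int clu, unsigned int cluster_size_bits)
{
	return clu << (cluster_size_bits - DENTRY_SIZE_BITS);
}

static unsigned int den_to_clu(unsigned int den, unsigned int cluster_size_bits)
{
	return den >> (cluster_size_bits - DENTRY_SIZE_BITS);
}

int main(void)
{
	unsigned int bits = 12;					/* assumed 4 KiB cluster */
	unsigned int per_clu = 1u << (bits - DENTRY_SIZE_BITS);	/* 128 dentries */

	assert(clu_to_den(3, bits) == 3 * per_clu);
	assert(den_to_clu(300, bits) == 300 / per_clu);		/* shift == divide */
	return 0;
}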
Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 9 ++++----- fs/exfat/exfat_fs.h | 10 ++++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index c05493fc9124..397ea2d98848 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -61,7 +61,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, /* read a directory entry from the opened directory */ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { - int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; + int i, dentries_per_clu, num_ext; unsigned int type, clu_offset, max_dentries; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; @@ -83,11 +83,10 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); dentries_per_clu = sbi->dentries_per_clu; - dentries_per_clu_bits = ilog2(dentries_per_clu); max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, - (u64)sbi->num_clusters << dentries_per_clu_bits); + (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi)); - clu_offset = dentry >> dentries_per_clu_bits; + clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi); exfat_chain_dup(&clu, &dir); if (clu.flags == ALLOC_NO_FAT_CHAIN) { @@ -162,7 +161,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent dir_entry->entry = dentry; brelse(bh); - ei->hint_bmap.off = dentry >> dentries_per_clu_bits; + ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi); ei->hint_bmap.clu = clu.dir; *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 324acc57d029..37e8af8042aa 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -101,11 +101,17 @@ enum { /* * helpers for block size to dentry size conversion. */ -#define EXFAT_B_TO_DEN_IDX(b, sbi) \ - ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) #define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS) #define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS) +/* + * helpers for cluster size to dentry size conversion. + */ +#define EXFAT_CLU_TO_DEN(clu, sbi) \ + ((clu) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) +#define EXFAT_DEN_TO_CLU(dentry, sbi) \ + ((dentry) >> ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) + /* * helpers for fat entry. */ -- cgit From 015c0d4f6b1e65857de88279f07d7ecc5e305137 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 15 Aug 2022 10:15:16 +0800 Subject: exfat: remove unneeded codes from __exfat_rename() The code gets the dentry, but the dentry is not used, remove the code. Code refinement, no functional changes. 
Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/namei.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 01e4e8c60bbe..347c8df45bd0 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -1175,7 +1175,7 @@ static int __exfat_rename(struct inode *old_parent_inode, struct exfat_inode_info *new_ei = NULL; unsigned int new_entry_type = TYPE_UNUSED; int new_entry = 0; - struct buffer_head *old_bh, *new_bh = NULL; + struct buffer_head *new_bh = NULL; /* check the validity of pointer parameters */ if (new_path == NULL || strlen(new_path) == 0) @@ -1191,13 +1191,6 @@ static int __exfat_rename(struct inode *old_parent_inode, EXFAT_I(old_parent_inode)->flags); dentry = ei->entry; - ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); - if (!ep) { - ret = -EIO; - goto out; - } - brelse(old_bh); - /* check whether new dir is existing directory and empty */ if (new_inode) { ret = -EIO; -- cgit From 72880cb5f157514d797d5f6ab3184bbde671a18a Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Sun, 10 Apr 2022 16:12:14 +0800 Subject: exfat: remove unnecessary arguments from exfat_find_dir_entry() This commit removes argument 'num_entries' and 'type' from exfat_find_dir_entry(). Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 12 +++++++----- fs/exfat/exfat_fs.h | 3 +-- fs/exfat/namei.c | 10 ++-------- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 397ea2d98848..8121a7e073bc 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -956,7 +956,7 @@ enum { */ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, - int num_entries, unsigned int type, struct exfat_hint *hint_opt) + struct exfat_hint *hint_opt) { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; @@ -967,6 +967,10 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_hint *hint_stat = &ei->hint_stat; struct exfat_hint_femp candi_empty; struct exfat_sb_info *sbi = EXFAT_SB(sb); + int num_entries = exfat_calc_num_entries(p_uniname); + + if (num_entries < 0) + return num_entries; dentries_per_clu = sbi->dentries_per_clu; @@ -1020,10 +1024,8 @@ rewind: step = DIRENT_STEP_FILE; hint_opt->clu = clu.dir; hint_opt->eidx = i; - if (type == TYPE_ALL || type == entry_type) { - num_ext = ep->dentry.file.num_ext; - step = DIRENT_STEP_STRM; - } + num_ext = ep->dentry.file.num_ext; + step = DIRENT_STEP_STRM; brelse(bh); continue; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 37e8af8042aa..21fec01d68ff 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -71,7 +71,6 @@ enum { #define TYPE_PADDING 0x0402 #define TYPE_ACLTAB 0x0403 #define TYPE_BENIGN_SEC 0x0800 -#define TYPE_ALL 0x0FFF #define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ @@ -490,7 +489,7 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es); int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct 
exfat_uni_name *p_uniname, - int num_entries, unsigned int type, struct exfat_hint *hint_opt); + struct exfat_hint *hint_opt); int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 347c8df45bd0..5f995eba5dbb 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -597,7 +597,7 @@ unlock: static int exfat_find(struct inode *dir, struct qstr *qname, struct exfat_dir_entry *info) { - int ret, dentry, num_entries, count; + int ret, dentry, count; struct exfat_chain cdir; struct exfat_uni_name uni_name; struct super_block *sb = dir->i_sb; @@ -616,10 +616,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname, if (ret) return ret; - num_entries = exfat_calc_num_entries(&uni_name); - if (num_entries < 0) - return num_entries; - /* check the validation of hint_stat and initialize it if required */ if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) { ei->hint_stat.clu = cdir.dir; @@ -629,9 +625,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, } /* search the file name for directories */ - dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, - num_entries, TYPE_ALL, &hint_opt); - + dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, &hint_opt); if (dentry < 0) return dentry; /* -error value */ -- cgit From e981917b3fae689e9372647a38746444205bb905 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Nov 2022 10:36:21 +0800 Subject: exfat: remove argument 'size' from exfat_truncate() argument 'size' is not used in exfat_truncate(), remove it. Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 +- fs/exfat/file.c | 4 ++-- fs/exfat/inode.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 21fec01d68ff..ae048802f9db 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -449,7 +449,7 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); /* file.c */ extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode, loff_t new_size); -void exfat_truncate(struct inode *inode, loff_t size); +void exfat_truncate(struct inode *inode); int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr); int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 4e0793f35e8f..7c97c1df1305 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -189,7 +189,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) return 0; } -void exfat_truncate(struct inode *inode, loff_t size) +void exfat_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -310,7 +310,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * __exfat_write_inode() is called from exfat_truncate(), inode * is already written by it, so mark_inode_dirty() is unneeded. 
*/ - exfat_truncate(inode, attr->ia_size); + exfat_truncate(inode); up_write(&EXFAT_I(inode)->truncate_lock); } else mark_inode_dirty(inode); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index dac5001bae9e..0d147f8a1f7c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -362,7 +362,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); inode->i_mtime = inode->i_ctime = current_time(inode); - exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned); + exfat_truncate(inode); } } -- cgit From f7cde96710a4362dca199458d3de04f631178453 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 28 Mar 2022 16:37:58 +0800 Subject: exfat: remove i_size_write() from __exfat_truncate() The file/directory size is updated into inode by i_size_write() before __exfat_truncate() is called, so it is redundant to re-update by i_size_write() in __exfat_truncate(). Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 +- fs/exfat/file.c | 8 +++----- fs/exfat/inode.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index ae048802f9db..a1e7feb22079 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -448,7 +448,7 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); /* file.c */ extern const struct file_operations exfat_file_operations; -int __exfat_truncate(struct inode *inode, loff_t new_size); +int __exfat_truncate(struct inode *inode); void exfat_truncate(struct inode *inode); int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr); diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 7c97c1df1305..f5b29072775d 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -93,7 +93,7 @@ static int exfat_sanitize_mode(const struct exfat_sb_info *sbi, } /* resize the file length */ -int __exfat_truncate(struct inode *inode, loff_t new_size) +int __exfat_truncate(struct inode *inode) { unsigned int num_clusters_new, num_clusters_phys; unsigned int last_clu = EXFAT_FREE_CLUSTER; @@ -113,7 +113,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); - if (new_size > 0) { + if (i_size_read(inode) > 0) { /* * Truncate FAT chain num_clusters after the first cluster * num_clusters = min(new, phys); @@ -143,8 +143,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) ei->start_clu = EXFAT_EOF_CLUSTER; } - i_size_write(inode, new_size); - if (ei->type == TYPE_FILE) ei->attr |= ATTR_ARCHIVE; @@ -207,7 +205,7 @@ void exfat_truncate(struct inode *inode) goto write_size; } - err = __exfat_truncate(inode, i_size_read(inode)); + err = __exfat_truncate(inode); if (err) goto write_size; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 0d147f8a1f7c..95adc4b2e436 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -626,7 +626,7 @@ void exfat_evict_inode(struct inode *inode) if (!inode->i_nlink) { i_size_write(inode, 0); mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); - __exfat_truncate(inode, 0); + __exfat_truncate(inode); mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); } -- cgit From 3720dd6dcac38d03424d6ba38107f39af5318bcf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:22 -0700 Subject: filemap: convert 
replace_page_cache_page() to replace_page_cache_folio() Patch series "Removing the lru_cache_add() wrapper". This patchset replaces all calls of lru_cache_add() with the folio equivalent: folio_add_lru(). This is allows us to get rid of the wrapper The series passes xfstests and the userfaultfd selftests. This patch (of 5): Eliminates 7 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221101175326.13265-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..26817a2db463 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -837,7 +837,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - replace_page_cache_page(oldpage, newpage); + replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); get_page(newpage); -- cgit From 063aaad792eef49a11d7575dc9914b43c0fa3792 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:23 -0700 Subject: fuse: convert fuse_try_move_page() to use folios Converts the function to try to move folios instead of pages. Also converts fuse_check_page() to fuse_get_folio() since this is its only caller. This change removes 15 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Miklos Szeredi Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 55 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 26817a2db463..204c332cd343 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } -static int fuse_check_page(struct page *page) +static int fuse_check_folio(struct folio *folio) { - if (page_mapcount(page) || - page->mapping != NULL || - (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + if (folio_mapped(folio) || + folio->mapping != NULL || + (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | @@ -778,7 +778,7 @@ static int fuse_check_page(struct page *page) 1 << PG_reclaim | 1 << PG_waiters | LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); + dump_page(&folio->page, "fuse: trying to steal weird page"); return 1; } return 0; @@ -787,11 +787,11 @@ static int fuse_check_page(struct page *page) static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; - struct page *oldpage = *pagep; - struct page *newpage; + struct folio *oldfolio = page_folio(*pagep); + struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; - get_page(oldpage); + folio_get(oldfolio); err = unlock_request(cs->req); if (err) goto out_put_old; @@ -814,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; - newpage = buf->page; + newfolio = page_folio(buf->page); - if (!PageUptodate(newpage)) - SetPageUptodate(newpage); + if (!folio_test_uptodate(newfolio)) + 
folio_mark_uptodate(newfolio); - ClearPageMappedToDisk(newpage); + folio_clear_mappedtodisk(newfolio); - if (fuse_check_page(newpage) != 0) + if (fuse_check_folio(newfolio) != 0) goto out_fallback_unlock; /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it */ - if (WARN_ON(page_mapped(oldpage))) + if (WARN_ON(folio_mapped(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(page_has_private(oldpage))) + if (WARN_ON(folio_has_private(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + if (WARN_ON(folio_test_dirty(oldfolio) || + folio_test_writeback(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageMlocked(oldpage))) + if (WARN_ON(folio_test_mlocked(oldfolio))) goto out_fallback_unlock; - replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); + replace_page_cache_folio(oldfolio, newfolio); - get_page(newpage); + folio_get(newfolio); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(newpage); + folio_add_lru(newfolio); /* * Release while we have extra ref on stolen page. Otherwise @@ -855,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = newpage; + *pagep = &newfolio->page; spin_unlock(&cs->req->waitq.lock); if (err) { - unlock_page(newpage); - put_page(newpage); + folio_unlock(newfolio); + folio_put(newfolio); goto out_put_old; } - unlock_page(oldpage); + folio_unlock(oldfolio); /* Drop ref for ap->pages[] array */ - put_page(oldpage); + folio_put(oldfolio); cs->len = 0; err = 0; out_put_old: /* Drop ref obtained in this function */ - put_page(oldpage); + folio_put(oldfolio); return err; out_fallback_unlock: - unlock_page(newpage); + folio_unlock(newfolio); out_fallback: cs->pg = buf->page; cs->offset = buf->offset; -- cgit From 169004265860327182ecf92297b25b6271e81e96 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:51 +0000 Subject: fsdax: introduce page->share for fsdax in reflink mode Patch series "fsdax,xfs: fix warning messages", v2. Many testcases failed in dax+reflink mode with warning message in dmesg. Such as generic/051,075,127. 
The warning message is like this: [ 775.509337] ------------[ cut here ]------------ [ 775.509636] WARNING: CPU: 1 PID: 16815 at fs/dax.c:386 dax_insert_entry.cold+0x2e/0x69 [ 775.510151] Modules linked in: auth_rpcgss oid_registry nfsv4 algif_hash af_alg af_packet nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink ip6table_filter ip6_tables iptable_filter ip_tables x_tables dax_pmem nd_pmem nd_btt sch_fq_codel configfs xfs libcrc32c fuse [ 775.524288] CPU: 1 PID: 16815 Comm: fsx Kdump: loaded Tainted: G W 6.1.0-rc4+ #164 eb34e4ee4200c7cbbb47de2b1892c5a3e027fd6d [ 775.524904] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.0-3-3 04/01/2014 [ 775.525460] RIP: 0010:dax_insert_entry.cold+0x2e/0x69 [ 775.525797] Code: c7 c7 18 eb e0 81 48 89 4c 24 20 48 89 54 24 10 e8 73 6d ff ff 48 83 7d 18 00 48 8b 54 24 10 48 8b 4c 24 20 0f 84 e3 e9 b9 ff <0f> 0b e9 dc e9 b9 ff 48 c7 c6 a0 20 c3 81 48 c7 c7 f0 ea e0 81 48 [ 775.526708] RSP: 0000:ffffc90001d57b30 EFLAGS: 00010082 [ 775.527042] RAX: 000000000000002a RBX: 0000000000000000 RCX: 0000000000000042 [ 775.527396] RDX: ffffea000a0f6c80 RSI: ffffffff81dfab1b RDI: 00000000ffffffff [ 775.527819] RBP: ffffea000a0f6c40 R08: 0000000000000000 R09: ffffffff820625e0 [ 775.528241] R10: ffffc90001d579d8 R11: ffffffff820d2628 R12: ffff88815fc98320 [ 775.528598] R13: ffffc90001d57c18 R14: 0000000000000000 R15: 0000000000000001 [ 775.528997] FS: 00007f39fc75d740(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 775.529474] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 775.529800] CR2: 00007f39fc772040 CR3: 0000000107eb6001 CR4: 00000000003706e0 [ 775.530214] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 775.530592] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 775.531002] Call Trace: [ 775.531230] [ 775.531444] dax_fault_iter+0x267/0x6c0 [ 775.531719] dax_iomap_pte_fault+0x198/0x3d0 [ 775.532002] __xfs_filemap_fault+0x24a/0x2d0 [xfs aa8d25411432b306d9554da38096f4ebb86bdfe7] [ 775.532603] __do_fault+0x30/0x1e0 [ 775.532903] do_fault+0x314/0x6c0 [ 775.533166] __handle_mm_fault+0x646/0x1250 [ 775.533480] handle_mm_fault+0xc1/0x230 [ 775.533810] do_user_addr_fault+0x1ac/0x610 [ 775.534110] exc_page_fault+0x63/0x140 [ 775.534389] asm_exc_page_fault+0x22/0x30 [ 775.534678] RIP: 0033:0x7f39fc55820a [ 775.534950] Code: 00 01 00 00 00 74 99 83 f9 c0 0f 87 7b fe ff ff c5 fe 6f 4e 20 48 29 fe 48 83 c7 3f 49 8d 0c 10 48 83 e7 c0 48 01 fe 48 29 f9 a4 c4 c1 7e 7f 00 c4 c1 7e 7f 48 20 c5 f8 77 c3 0f 1f 44 00 00 [ 775.535839] RSP: 002b:00007ffc66a08118 EFLAGS: 00010202 [ 775.536157] RAX: 00007f39fc772001 RBX: 0000000000042001 RCX: 00000000000063c1 [ 775.536537] RDX: 0000000000006400 RSI: 00007f39fac42050 RDI: 00007f39fc772040 [ 775.536919] RBP: 0000000000006400 R08: 00007f39fc772001 R09: 0000000000042000 [ 775.537304] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 [ 775.537694] R13: 00007f39fc772000 R14: 0000000000006401 R15: 0000000000000003 [ 775.538086] [ 775.538333] ---[ end trace 0000000000000000 ]--- This also affects dax+noreflink mode if we run the test after a dax+reflink test. So, the most urgent thing is solving the warning messages. With these fixes, most warning messages in dax_associate_entry() are gone. But honestly, generic/388 still fails randomly with the warning. That test case shuts down the xfs filesystem while fsstress is running, and repeats this many times.
I think the reason is that in-use dax pages cannot be invalidated in time when the filesystem is shut down. The next time a dax page is to be associated, it still holds the mapping value set last time. I'll keep working on it. The warning message in dax_writeback_one() can also be fixed thanks to the dax unshare. This patch (of 8): A fsdax page is used not only for CoW, but also for mapread. To make this easily understood, use 'share' to indicate that the dax page is shared by more than one extent. And add helper functions to use it. Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED. [ruansy.fnst@fujitsu.com: rename several functions] Link: https://lkml.kernel.org/r/1669972991-246-1-git-send-email-ruansy.fnst@fujitsu.com [ruansy.fnst@fujitsu.com: v2.2] Link: https://lkml.kernel.org/r/1670381359-53-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-2-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Alistair Popple Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 1c6867810cbd..84fadea08705 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) -static inline bool dax_mapping_is_cow(struct address_space *mapping) +static inline bool dax_page_is_shared(struct page *page) { - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; + return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the + * refcount. */ -static inline void dax_mapping_set_cow(struct page *page) +static inline void dax_page_share_get(struct page *page) { - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before. */ if (page->mapping) - page->index = 1; - page->mapping = (void *)PAGE_MAPPING_DAX_COW; + page->share = 1; + page->mapping = PAGE_MAPPING_DAX_SHARED; } - page->index++; + page->share++; +} + +static inline unsigned long dax_page_share_put(struct page *page) +{ + return --page->share; } /* - * When it is called in dax_insert_entry(), the cow flag will indicate that + * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping - * FS_DAX_MAPPING_COW, and use page->index as refcount. + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
*/ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool cow) + struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - if (cow) { - dax_mapping_set_cow(page); + if (shared) { + dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_mapping_is_cow(page->mapping)) { - /* keep the CoW flag if this page is still shared */ - if (page->index-- > 0) + if (dax_page_is_shared(page)) { + /* keep the shared flag if this page is still shared */ + if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); -- cgit From f80e1668888f34c0764822e74953c997daf2ccdb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:52 +0000 Subject: fsdax: invalidate pages when CoW CoW changes the share state of a dax page, but the share count of the page isn't updated. The next time this page is accessed, it should be treated as newly accessed, but the old association still exists. So, we need to clear the share state when CoW happens, in both dax_iomap_rw() and dax_zero_iter(). Link: https://lkml.kernel.org/r/1669908538-55-3-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 84fadea08705..c975d075e77b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1264,6 +1264,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return length; + /* + * invalidate the pages whose sharing state is to be changed + * because of CoW. + */ + if (iomap->flags & IOMAP_F_SHARED) + invalidate_inode_pages2_range(iter->inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + length - 1) >> PAGE_SHIFT); + do { unsigned offset = offset_in_page(pos); unsigned size = min_t(u64, PAGE_SIZE - offset, length); @@ -1324,12 +1333,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; - const struct iomap *srcmap = &iomi->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; + bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; @@ -1356,7 +1366,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap.
*/ - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW || cow) { invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1390,8 +1400,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + if (cow) { ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, kaddr); if (ret) -- cgit From 708dfad2eb4169324189782edd6d3763237e0489 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:53 +0000 Subject: fsdax: zero the edges if source is HOLE or UNWRITTEN If the srcmap contains invalid data, such as HOLE and UNWRITTEN, the dest page should be zeroed. Otherwise, since it's pmem, old data may remain on the dest page, and the result of CoW will be incorrect. The function name is also not easy to understand; rename it to "dax_iomap_copy_around()", which means it copies data around the range. [akpm@linux-foundation.org: update dax_iomap_copy_around() kerneldoc, per Darrick] Link: https://lkml.kernel.org/r/1669973145-318-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-4-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 79 ++++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index c975d075e77b..359b958eb835 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1092,7 +1092,8 @@ out: } /** - * dax_iomap_cow_copy - Copy the data from source to destination before write + * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page + * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) @@ -1101,35 +1102,50 @@ out: * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX - * write operation, dax_iomap_actor() might call this to do the copy of either + * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of - * aligned ranges is taken care by dax_iomap_actor() itself. + * aligned ranges is taken care by dax_iomap_iter() itself. + * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the + * area to make sure no old data remains.
*/ -static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, +static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); + /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; + /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ + bool zero_edge = srcmap->flags & IOMAP_F_SHARED || + srcmap->type == IOMAP_UNWRITTEN; void *saddr = 0; int ret = 0; - ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); - if (ret) - return ret; + if (!zero_edge) { + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + } if (copy_all) { - ret = copy_mc_to_kernel(daddr, saddr, length); - return ret ? -EIO : 0; + if (zero_edge) + memset(daddr, 0, size); + else + ret = copy_mc_to_kernel(daddr, saddr, length); + goto out; } /* Copy the head part of the range */ if (head_off) { - ret = copy_mc_to_kernel(daddr, saddr, head_off); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr, 0, head_off); + else { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } } /* Copy the tail part of the range */ @@ -1137,12 +1153,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; - ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, - tail_len); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr + tail_off, 0, tail_len); + else { + ret = copy_mc_to_kernel(daddr + tail_off, + saddr + tail_off, tail_len); + if (ret) + return -EIO; + } } - return 0; +out: + if (zero_edge) + dax_flush(srcmap->dax_dev, daddr, size); + return ret ? -EIO : 0; } /* @@ -1241,13 +1264,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) if (ret < 0) return ret; memset(kaddr + offset, 0, size); - if (srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, - kaddr); - if (ret < 0) - return ret; - dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); - } else + if (iomap->flags & IOMAP_F_SHARED) + ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, + kaddr); + else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1401,8 +1421,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } if (cow) { - ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, - kaddr); + ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, + srcmap, kaddr); if (ret) break; } @@ -1547,7 +1567,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; - const struct iomap *srcmap = &iter->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? 
PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; @@ -1578,9 +1598,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (write && iomap->flags & IOMAP_F_SHARED) { + err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } -- cgit From c6f0b395b2110aa26a134a9a395875b1ec0a5aae Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:54 +0000 Subject: fsdax,xfs: set the shared flag when file extent is shared If a dax page is shared, mapread at different offsets can also trigger page fault on same dax page. So, change the flag from "cow" to "shared". And get the shared flag from filesystem when read. Link: https://lkml.kernel.org/r/1669908538-55-5-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 19 +++++++------------ fs/xfs/xfs_iomap.c | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 359b958eb835..fa547ce41add 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -846,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, (iter->iomap.flags & IOMAP_F_DIRTY); } -static bool dax_fault_is_cow(const struct iomap_iter *iter) -{ - return (iter->flags & IOMAP_WRITE) && - (iter->iomap.flags & IOMAP_F_SHARED); -} - /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to @@ -865,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); - bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); - bool cow = dax_fault_is_cow(iter); + bool write = iter->flags & IOMAP_WRITE; + bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); + bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { + if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -883,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, xas_reset(xas); xas_lock_irq(xas); - if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - cow); + shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. 
If a normal PTE or @@ -908,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - if (cow) + if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..881de99766ca 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1215,7 +1215,7 @@ xfs_read_iomap_begin( return error; error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); - if (!error && (flags & IOMAP_REPORT)) + if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); xfs_iunlock(ip, lockmode); -- cgit From 0e79e3736d54bb8efbc9fb29cc3b54a132783565 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:31:41 +0000 Subject: fsdax: dedupe: iter two files at the same time The iomap_iter() on a range of one file may loop more than once. In this case, the inner dst_iter can update its iomap but the outer src_iter can't. This may cause the wrong remapping in filesystem. Let them called at the same time. Link: https://lkml.kernel.org/r/1669908701-93-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index fa547ce41add..8fb928cd9dce 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1965,15 +1965,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret; + int ret, compared = 0; - while ((ret = iomap_iter(&src_iter, ops)) > 0) { - while ((ret = iomap_iter(&dst_iter, ops)) > 0) { - dst_iter.processed = dax_range_compare_iter(&src_iter, - &dst_iter, len, same); - } - if (ret <= 0) - src_iter.processed = ret; + while ((ret = iomap_iter(&src_iter, ops)) > 0 && + (ret = iomap_iter(&dst_iter, ops)) > 0) { + compared = dax_range_compare_iter(&src_iter, &dst_iter, len, + same); + if (compared < 0) + return ret; + src_iter.processed = dst_iter.processed = compared; } return ret; } -- cgit From 64e6edc185da7e101e867c4732c097fedb1da08e Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:10 +0000 Subject: xfs: use dax ops for zero and truncate in fsdax mode Zero and truncate on a dax file may execute CoW. So use dax ops which contains end work for CoW. Link: https://lkml.kernel.org/r/1669908730-131-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 881de99766ca..d9401d0300ad 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1370,7 +1370,7 @@ xfs_zero_range( if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, &xfs_buffered_write_iomap_ops); } @@ -1385,7 +1385,7 @@ xfs_truncate_page( if (IS_DAX(inode)) return dax_truncate_page(inode, pos, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, &xfs_buffered_write_iomap_ops); } -- cgit From d984648e428bf88cbd94ebe346c73632cb92fffb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:33 +0000 Subject: fsdax,xfs: port unshare to fsdax Implement unshare in fsdax mode: copy data from srcmap to iomap. Link: https://lkml.kernel.org/r/1669908753-169-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.c | 8 ++++++-- 2 files changed, 58 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 8fb928cd9dce..c48a3a93ab29 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1245,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ +static s64 dax_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + int id = 0; + s64 ret = 0; + void *daddr = NULL, *saddr = NULL; + + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + id = dax_read_lock(); + ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = copy_mc_to_kernel(daddr, saddr, length); + if (ret) + ret = -EIO; + +out_unlock: + dax_read_unlock(id); + return ret; +} + +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_unshare_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(dax_file_unshare); + static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 93bdd25680bc..fe46bce8cae6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1693,8 +1693,12 @@ xfs_reflink_unshare( inode_dio_wait(inode); - error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + if (IS_DAX(inode)) + error = 
dax_file_unshare(inode, offset, len, + &xfs_dax_write_iomap_ops); + else + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) goto out; -- cgit From 480017957d6380d3336a8e80ad90f70415bb86f7 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:53 +0000 Subject: xfs: remove restrictions for fsdax and reflink Since the basic functionality for fsdax and reflink has been implemented, remove the restrictions on them to allow wider testing. Link: https://lkml.kernel.org/r/1669908773-207-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_ioctl.c | 4 ---- fs/xfs/xfs_iops.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f783e979629..13f1b2add390 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags( if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; - /* Don't allow us to set DAX mode for a reflinked file for now. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) - return -EINVAL; - /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); if (i_flags2 && !xfs_has_v3inodes(mp)) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..bf0495f7a5e1 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1185,10 +1185,6 @@ xfs_inode_supports_dax( if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* Only supported on non-reflinked files. */ - if (xfs_is_reflink_inode(ip)) - return false; - /* Block size must match page size */ if (mp->m_sb.sb_blocksize != PAGE_SIZE) return false; -- cgit From a11774122180a782b327b0a9a5091d99c91a4db7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:38 +0100 Subject: exfat: remove ->writepage Patch series "start removing writepage instances v2". The VM doesn't need or want ->writepage for writeback and is fine with just having ->writepages as long as ->migrate_folio is implemented. This series removes all ->writepage instances that use block_write_full_page directly and also have a plain mpage_writepages based ->writepages. This patch (of 7): ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
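The end state of each conversion in this series is the same. As a rough sketch of the resulting pattern for exfat (the ->writepages body here is an assumption based on the series description, not quoted from the patch):

	static int exfat_writepages(struct address_space *mapping,
				    struct writeback_control *wbc)
	{
		/* writeback goes through the mpage helper; no ->writepage needed */
		return mpage_writepages(mapping, wbc, exfat_get_block);
	}

	static const struct address_space_operations exfat_aops = {
		.writepages	= exfat_writepages,
		/* migration no longer needs the ->writepage fallback */
		.migrate_folio	= buffer_migrate_folio,
		/* read_folio, write_begin, write_end, etc. as in the hunk below */
	};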
Link: https://lkml.kernel.org/r/20221202102644.770505-1-hch@lst.de Link: https://lkml.kernel.org/r/20221202102644.770505-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Namjae Jeon Acked-by: Johannes Weiner Cc: Bob Copeland Cc: Dave Kleikamp Cc: Jan Kara Cc: Mikulas Patocka Cc: OGAWA Hirofumi Cc: Sungjong Seo Signed-off-by: Andrew Morton --- fs/exfat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5590a1e83126..eac95bcd9a8a 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -345,11 +345,6 @@ static void exfat_readahead(struct readahead_control *rac) mpage_readahead(rac, exfat_get_block); } -static int exfat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, exfat_get_block, wbc); -} - static int exfat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -473,12 +468,12 @@ static const struct address_space_operations exfat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = exfat_read_folio, .readahead = exfat_readahead, - .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, .write_end = exfat_write_end, .direct_IO = exfat_direct_IO, - .bmap = exfat_aop_bmap + .bmap = exfat_aop_bmap, + .migrate_folio = buffer_migrate_folio, }; static inline unsigned long exfat_hash(loff_t i_pos) -- cgit From ee649af0d9a60ea61a5dad99ef5d6b4aa346f0a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:39 +0100 Subject: fat: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/fat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1cbcc4608dc7..d99b8549ec8f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock, return 0; } -static int fat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, fat_get_block, wbc); -} - static int fat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = fat_read_folio, .readahead = fat_readahead, - .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, .write_end = fat_write_end, .direct_IO = fat_direct_IO, - .bmap = _fat_bmap + .bmap = _fat_bmap, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From ba195d9f14829690b8e4f67549960d83169a314e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:40 +0100 Subject: hfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfs_aops.
Link: https://lkml.kernel.org/r/20221202102644.770505-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..16466a5e88b4 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, - .writepage = hfs_writepage, .write_begin = hfs_write_begin, .write_end = generic_write_end, .bmap = hfs_bmap, .direct_IO = hfs_direct_IO, .writepages = hfs_writepages, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From 12f9b9a73dc603e658bf24eed2777cecdaf4103e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:41 +0100 Subject: hfsplus: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfsplus_aops. Link: https://lkml.kernel.org/r/20221202102644.770505-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfsplus/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..d6572ad2407a 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, - .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .bmap = hfsplus_bmap, .direct_IO = hfsplus_direct_IO, .writepages = hfsplus_writepages, + .migrate_folio = buffer_migrate_folio, }; const struct dentry_operations hfsplus_dentry_operations = { -- cgit From cd2e6024260de27a523e0af6ee47a20a6b8b8aa8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:42 +0100 Subject: hpfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-6-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hpfs/file.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index f7547a62c81f..88952d4a631e 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio) return mpage_read_folio(folio, hpfs_get_block); } -static int hpfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, hpfs_get_block, wbc); -} - static void hpfs_readahead(struct readahead_control *rac) { mpage_readahead(rac, hpfs_get_block); @@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hpfs_read_folio, - .writepage = hpfs_writepage, .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, - .bmap = _hpfs_bmap + .bmap = _hpfs_bmap, + .migrate_folio = buffer_migrate_folio, }; const struct file_operations hpfs_file_ops = -- cgit From 2274c3b281bb47e6980ae42fb8dc93b7a38192d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:43 +0100 Subject: jfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/jfs/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index d1ec920aa030..8ac10e396050 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, return rc; } -static int jfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, jfs_get_block, wbc); -} - static int jfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = jfs_read_folio, .readahead = jfs_readahead, - .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, .write_end = jfs_write_end, .bmap = jfs_bmap, .direct_IO = jfs_direct_IO, + .migrate_folio = buffer_migrate_folio, }; /* -- cgit From 1bda9dad5aa0199c8592bac32b91afbf8ea236ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:44 +0100 Subject: omfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221202102644.770505-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Bob Copeland Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/omfs/file.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/omfs/file.c b/fs/omfs/file.c index fa7fe2393ff6..3a5b4b88a583 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac) mpage_readahead(rac, omfs_get_block); } -static int omfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, omfs_get_block, wbc); -} - static int omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = omfs_read_folio, .readahead = omfs_readahead, - .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, .write_end = generic_write_end, .bmap = omfs_bmap, + .migrate_folio = buffer_migrate_folio, }; -- cgit From 8614d6c5eda005ad72b37afeaae2879d7c101b18 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 5 Dec 2022 18:30:07 +0100 Subject: mm: do not show fs mm pc for VM_LOCKONFAULT pages When VM_LOCKONFAULT was added, /proc/PID/smaps wasn't hooked up to it, so looking at /proc/PID/smaps, it shows '??' instead of something intelligible. This can be reached by userspace by simply calling `mlock2(..., MLOCK_ONFAULT);`. Fix this by adding "lf" to denote VM_LOCKONFAULT. Link: https://lkml.kernel.org/r/20221205173007.580210-1-Jason@zx2c4.com Fixes: de60f5f10c58 ("mm: introduce VM_LOCKONFAULT") Signed-off-by: Jason A. Donenfeld Acked-by: Vlastimil Babka Cc: Eric B Munson Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 89338950afd3..e35a0398db63 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -674,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", -- cgit From c9a934c7d88413a35861387a11e901554810b122 Mon Sep 17 00:00:00 2001 From: Alexey Asemov Date: Sun, 27 Nov 2022 09:46:38 +0300 Subject: ocfs2: always read both high and low parts of dinode link count When the filesystem is using the indexed-dirs feature, maximum link count values can spill over to i_links_count_hi, up to OCFS2_DX_LINK_MAX links. ocfs2_read_links_count() checks for the OCFS2_INDEXED_DIR_FL flag in the dinode, but this flag is only valid for directories, so for files the check causes the high part of the link count not to be read back from file dinodes, resulting in a wrong link count value when a file has more than 65535 links. As ocfs2_set_links_count() always writes both high and low parts of the link count, the flag check on reading may be removed.
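To make the overflow concrete with a hypothetical value: for a regular file with 70000 links, the two on-disk halves hold i_links_count = 70000 & 0xffff = 4464 and i_links_count_hi = 70000 >> 16 = 1. The unconditional read in the hunk below reconstructs 4464 | (1 << OCFS2_LINKS_HI_SHIFT) = 70000 (the shift is 16), whereas the old flag-gated path returned just 4464 for files, since OCFS2_INDEXED_DIR_FL is only set on directories.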
Link: https://lkml.kernel.org/r/cbfca02b-b39f-89de-e1a8-904a6c60407e@alex-at.net Signed-off-by: Alexey Asemov Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 740b64238312..a503c553bab2 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); - if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) - nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } -- cgit From 8d824e69d9f3fa3121b2dda25053bae71e2460d2 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Wed, 30 Nov 2022 06:59:59 +0000 Subject: hfs: fix OOB Read in __hfs_brec_find Syzbot reported an OOB read bug: ================================================================== BUG: KASAN: slab-out-of-bounds in hfs_strcmp+0x117/0x190 fs/hfs/string.c:84 Read of size 1 at addr ffff88807eb62c4e by task kworker/u4:1/11 CPU: 1 PID: 11 Comm: kworker/u4:1 Not tainted 6.1.0-rc6-syzkaller-00308-g644e9524388a #0 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 print_address_description+0x74/0x340 mm/kasan/report.c:284 print_report+0x107/0x1f0 mm/kasan/report.c:395 kasan_report+0xcd/0x100 mm/kasan/report.c:495 hfs_strcmp+0x117/0x190 fs/hfs/string.c:84 __hfs_brec_find+0x213/0x5c0 fs/hfs/bfind.c:75 hfs_brec_find+0x276/0x520 fs/hfs/bfind.c:138 hfs_write_inode+0x34c/0xb40 fs/hfs/inode.c:462 write_inode fs/fs-writeback.c:1440 [inline] If the input inode of hfs_write_inode() is incorrect: struct inode struct hfs_inode_info struct hfs_cat_key struct hfs_name u8 len # len is greater than HFS_NAMELEN (31), which is the maximum length of an HFS filename, an OOB read occurred: hfs_write_inode() hfs_brec_find() __hfs_brec_find() hfs_cat_keycmp() hfs_strcmp() # OOB read occurred because len is too large Fix this by adding a check on len in hfs_write_inode() before calling hfs_brec_find(). Link: https://lkml.kernel.org/r/20221130065959.2168236-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reported-by: Cc: Damien Le Moal Cc: Ira Weiny Cc: Jeff Layton Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Nanyong Sun Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton --- fs/hfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..a0746be3c1de 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) /* panic? */ return -EIO; + if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) + return -EIO; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) /* panic? 
*/ -- cgit From c53ed55cb275344086e32a7080a6b19cb183650b Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 2 Dec 2022 03:00:38 +0000 Subject: hfs: Fix OOB Write in hfs_asc2mac Syzbot reported an OOB write bug: loop0: detected capacity change from 0 to 64 ================================================================== BUG: KASAN: slab-out-of-bounds in hfs_asc2mac+0x467/0x9a0 fs/hfs/trans.c:133 Write of size 1 at addr ffff88801848314e by task syz-executor391/3632 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 print_address_description+0x74/0x340 mm/kasan/report.c:284 print_report+0x107/0x1f0 mm/kasan/report.c:395 kasan_report+0xcd/0x100 mm/kasan/report.c:495 hfs_asc2mac+0x467/0x9a0 fs/hfs/trans.c:133 hfs_cat_build_key+0x92/0x170 fs/hfs/catalog.c:28 hfs_lookup+0x1ab/0x2c0 fs/hfs/dir.c:31 lookup_open fs/namei.c:3391 [inline] open_last_lookups fs/namei.c:3481 [inline] path_openat+0x10e6/0x2df0 fs/namei.c:3710 do_filp_open+0x264/0x4f0 fs/namei.c:3740 If in->len is much larger than HFS_NAMELEN (31), which is the maximum length of an HFS filename, an OOB write could occur in hfs_asc2mac(). In that case, when the dst reaches the boundary, the srclen is still greater than 0, which causes an OOB write. Fix this by adding a check on dstlen in the while() loop before writing to the dst address. Link: https://lkml.kernel.org/r/20221202030038.1391945-1-zhangpeng362@huawei.com Fixes: 328b92278650 ("[PATCH] hfs: NLS support") Signed-off-by: ZhangPeng Reviewed-by: Viacheslav Dubeyko Reported-by: Signed-off-by: Andrew Morton --- fs/hfs/trans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index 39f5e343bf4d..fdb0edb8a607 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr if (nls_io) { wchar_t ch; - while (srclen > 0) { + while (srclen > 0 && dstlen > 0) { size = nls_io->char2uni(src, srclen, &ch); if (size < 0) { ch = '?'; -- cgit From 9f2b5debc07073e6dfdd774e3594d0224b991927 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Wed, 7 Dec 2022 03:05:40 +0000 Subject: hfsplus: fix bug causing custom uid and gid being unable to be assigned with mount Despite specifying UID and GID in the mount command, the specified UID and GID were not being assigned. This patch fixes this issue.
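As a usage illustration (device path and ids hypothetical): after this fix, mount -t hfsplus -o uid=1000,gid=1000 /dev/sdX /mnt makes every inode in the mount appear owned by 1000:1000 even when the on-disk owner and mode are set, whereas previously the override was only applied when the on-disk owner and mode both read back as zero.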
Link: https://lkml.kernel.org/r/C0264BF5-059C-45CF-B8DA-3A3BD2C803A2@live.com Signed-off-by: Aditya Garg Reviewed-by: Viacheslav Dubeyko Cc: Signed-off-by: Andrew Morton --- fs/hfsplus/hfsplus_fs.h | 2 ++ fs/hfsplus/inode.c | 4 ++-- fs/hfsplus/options.c | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a5db2e3b2980..6aa919e59483 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -198,6 +198,8 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_HFSX 3 #define HFSPLUS_SB_CASEFOLD 4 #define HFSPLUS_SB_NOBARRIER 5 +#define HFSPLUS_SB_UID 6 +#define HFSPLUS_SB_GID 7 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..b675581aa9d0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode, mode = be16_to_cpu(perms->mode); i_uid_write(inode, be32_to_cpu(perms->owner)); - if (!i_uid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) inode->i_uid = sbi->uid; i_gid_write(inode, be32_to_cpu(perms->group)); - if (!i_gid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode)) inode->i_gid = sbi->gid; if (dir) { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 047e05c57560..c94a58762ad6 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!uid_valid(sbi->uid)) { pr_err("invalid uid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_UID, &sbi->flags); } break; case opt_gid: @@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!gid_valid(sbi->gid)) { pr_err("invalid gid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_GID, &sbi->flags); } break; case opt_part: -- cgit From e4b731ccb0975fd97283e0c0d9841a89063ec31a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 18 Oct 2022 09:03:29 +0800 Subject: ceph: remove useless session parameter for check_caps() The session parameter makes no sense any more. Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 23 +++++++++-------------- fs/ceph/file.c | 17 +++++++---------- fs/ceph/inode.c | 6 +++--- fs/ceph/ioctl.c | 2 +- fs/ceph/super.h | 3 +-- 6 files changed, 22 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dcf701b05cc1..ff6e3c279a79 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1367,7 +1367,7 @@ out: folio_put(folio); if (check_cap) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY); return copied; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e54814d0c2f7..1323fa28ab01 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1898,8 +1898,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. 
*/ -void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session) +void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); @@ -1913,15 +1912,12 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; - - if (session) - ceph_get_mds_session(session); + struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); - ceph_put_mds_session(session); return; } @@ -2851,7 +2847,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) @@ -3140,7 +3136,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, switch (mode) { case PUT_CAP_REFS_SYNC: if (last) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; @@ -3255,7 +3251,7 @@ unlock: spin_unlock(&ci->i_ceph_lock); if (last) { - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } @@ -3604,10 +3600,9 @@ static void handle_cap_grant(struct inode *inode, mutex_unlock(&session->s_mutex); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, - session); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); + ceph_check_caps(ci, CHECK_CAPS_NOINVAL); } /* @@ -4333,7 +4328,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } @@ -4362,7 +4357,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 04fd34557de8..4e68220bc06d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -313,7 +313,7 @@ int ceph_renew_caps(struct inode *inode, int fmode) spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", inode, ceph_cap_string(wanted), ceph_cap_string(issued)); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return 0; } spin_unlock(&ci->i_ceph_lock); @@ -408,7 +408,7 @@ int ceph_open(struct inode *inode, struct file *file) if ((issued & wanted) != wanted && (mds_wanted & wanted) != wanted && ceph_snap(inode) != CEPH_SNAPDIR) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && @@ -1092,7 +1092,7 @@ static void ceph_aio_complete(struct inode *inode, loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; if (endoff > i_size_read(inode)) { if (ceph_inode_set_size(inode, endoff)) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, 
CHECK_CAPS_AUTHONLY); } spin_lock(&ci->i_ceph_lock); @@ -1421,8 +1421,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && pos > size) { if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1577,8 +1576,7 @@ out: check_caps = ceph_inode_set_size(inode, pos); if (check_caps) ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); + CHECK_CAPS_AUTHONLY); } } @@ -1906,7 +1904,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -2521,8 +2519,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, /* Let the MDS know about dst file size change */ if (ceph_inode_set_size(dst_inode, dst_off) || ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, - NULL); + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bad9eeb6a1a5..12173c00129f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1909,7 +1909,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_unlock(&ci->i_truncate_mutex); out: if (check) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } /* @@ -1969,7 +1969,7 @@ retry: mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); wake_up_all(&ci->i_cap_wq); } @@ -1991,7 +1991,7 @@ static void ceph_inode_work(struct work_struct *work) __ceph_do_pending_vmtruncate(inode); if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) ceph_flush_snaps(ci, NULL); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 6e061bf62ad4..deac817647eb 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -253,7 +253,7 @@ static long ceph_ioctl_lazyio(struct file *file) spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } else { dout("ioctl_layzio: file %p already lazy\n", file); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 40630e6f691c..e8bc1d0d2614 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1200,8 +1200,7 @@ extern void ceph_remove_capsnap(struct inode *inode, extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); -extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags); extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_drop_caps_for_unlink(struct inode *inode); -- cgit From 68c62bee9d081cf815310b3a96e38d94fc16007d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 17 Oct 2022 22:17:35 +0800 Subject: ceph: try to check caps immediately after async creating finishes We should call the check_caps() again immediately after the async creating finishes in case the MDS is waiting for caps revocation to 
finish. Link: https://tracker.ceph.com/issues/46904 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ fs/ceph/file.c | 9 +++++++++ fs/ceph/super.h | 2 ++ 3 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1323fa28ab01..4b159f97fe7b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1916,6 +1916,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags) spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; + /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); return; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4e68220bc06d..a6681b12e280 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -534,14 +534,23 @@ static void wake_async_create_waiters(struct inode *inode, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + bool check_cap = false; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + + if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; + check_cap = true; + } } ceph_kick_flushing_inode_caps(session, ci); spin_unlock(&ci->i_ceph_lock); + + if (check_cap) + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } static void ceph_async_create_cb(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e8bc1d0d2614..e1bd6f487226 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -593,6 +593,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ +#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async + creating finishes */ /* * Masks of ceph inode work. -- cgit From c19204cbd65c12fdcd34fb8f5d645007238ed5cd Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 8 Dec 2022 16:11:00 -0600 Subject: cifs: minor cleanup of some headers checkpatch showed formatting problems: extra spaces, an extra semicolon, and some missing blank lines in some cifs headers.
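Problems of this kind are what the kernel's own style checker reports; a typical invocation against a whole file (rather than a patch) would be something like: ./scripts/checkpatch.pl -f fs/cifs/cifsglob.h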
Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Germano Percossi Signed-off-by: Steve French --- fs/cifs/cifs_ioctl.h | 2 +- fs/cifs/cifsfs.h | 4 ++-- fs/cifs/cifsglob.h | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index d86d78d5bfdc..332588e77c31 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -108,7 +108,7 @@ struct smb3_notify_info { #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) #define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) -#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) +#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* * Flags for going down operation diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 388b745a978e..00a573e0ad0e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -105,8 +105,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); extern int cifs_flush(struct file *, fl_owner_t id); -extern int cifs_file_mmap(struct file * , struct vm_area_struct *); -extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); +extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma); +extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1420acf987f0..cd3a173e65b1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -785,6 +785,7 @@ static inline unsigned int in_flight(struct TCP_Server_Info *server) { unsigned int num; + spin_lock(&server->req_lock); num = server->in_flight; spin_unlock(&server->req_lock); @@ -795,6 +796,7 @@ static inline bool has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; + spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); @@ -1025,7 +1027,7 @@ struct cifs_ses { struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */ - unsigned overrideSecFlg; /* if non-zero override global sec flags */ + unsigned int overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ @@ -1381,7 +1383,7 @@ struct cifsFileInfo { __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ struct list_head rlist; /* reconnect list */ - /* BB add lock scope info here if needed */ ; + /* BB add lock scope info here if needed */ /* lock scope id (0 if none) */ struct dentry *dentry; struct tcon_link *tlink; @@ -1769,6 +1771,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, int number_of_items) { int i; + if ((number_of_items == 0) || (param == NULL)) return; for (i = 0; i < number_of_items; i++) { -- cgit From 9544597b5b63ff1674d60e069f93555ab924b62b Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 9 Dec 2022 16:55:51 -0600 Subject: cifs: fix various whitespace errors in 
headers Fix some extra spaces and a few comments that were unnecessarily split over two lines. These were some trivial issues pointed out by checkpatch) Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 50 +++++++++++++++++++++----------------------------- fs/cifs/cifsproto.h | 2 +- 2 files changed, 22 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index d1abaeea974a..623caece2b10 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1429,7 +1429,7 @@ typedef struct smb_com_transaction_change_notify_req { __u8 WatchTree; /* 1 = Monitor subdirectories */ __u8 Reserved2; __le16 ByteCount; -/* __u8 Pad[3];*/ +/* __u8 Pad[3];*/ /* __u8 Data[1];*/ } __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; @@ -1752,8 +1752,7 @@ struct smb_com_transaction2_sfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); struct smb_t2_qfi_req { @@ -1768,8 +1767,7 @@ struct smb_t2_qfi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ } __attribute__((packed)); /* @@ -2146,13 +2144,11 @@ typedef struct { #define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based calls including posix open and posix unlink */ -#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up - to 0xFFFF00 */ +#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00 */ #define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 #define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ #define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ -#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and - QFS PROXY call */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX /* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send LockingX instead of posix locking call on unix sess (and we do not expect @@ -2368,8 +2364,7 @@ typedef struct { struct file_allocation_info { __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ -} __attribute__((packed)); /* size used on disk, for level 0x103 for set, - 0x105 for query */ +} __packed; /* size used on disk, for level 0x103 for set, 0x105 for query */ struct file_end_of_file_info { __le64 FileSize; /* offset to end of file */ @@ -2409,8 +2404,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ __le16 access_entry_count; /* access ACL - count of entries */ __le16 default_entry_count; /* default ACL - count of entries */ struct cifs_posix_ace ace_array[]; - /* followed by - struct cifs_posix_ace default_ace_arraay[] */ + /* followed by struct cifs_posix_ace default_ace_array[] */ } __attribute__((packed)); /* level 0x204 */ /* types of access control entries already defined in posix_acl.h */ @@ -2429,17 +2423,17 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ /* end of POSIX ACL definitions */ /* POSIX Open Flags */ -#define SMB_O_RDONLY 0x1 -#define SMB_O_WRONLY 0x2 -#define SMB_O_RDWR 0x4 -#define SMB_O_CREAT 0x10 -#define SMB_O_EXCL 0x20 -#define SMB_O_TRUNC 0x40 
-#define SMB_O_APPEND 0x80 -#define SMB_O_SYNC 0x100 -#define SMB_O_DIRECTORY 0x200 -#define SMB_O_NOFOLLOW 0x400 -#define SMB_O_DIRECT 0x800 +#define SMB_O_RDONLY 0x1 +#define SMB_O_WRONLY 0x2 +#define SMB_O_RDWR 0x4 +#define SMB_O_CREAT 0x10 +#define SMB_O_EXCL 0x20 +#define SMB_O_TRUNC 0x40 +#define SMB_O_APPEND 0x80 +#define SMB_O_SYNC 0x100 +#define SMB_O_DIRECTORY 0x200 +#define SMB_O_NOFOLLOW 0x400 +#define SMB_O_DIRECT 0x800 typedef struct { __le32 OpenFlags; /* same as NT CreateX */ @@ -2716,15 +2710,13 @@ typedef struct file_xattr_info { __u32 xattr_value_len; char xattr_name[]; /* followed by xattr_value[xattr_value_len], no pad */ -} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info - level 0x205 */ +} __packed FILE_XATTR_INFO; /* extended attribute info level 0x205 */ /* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */ typedef struct file_chattr_info { __le64 mask; /* list of all possible attribute bits */ __le64 mode; /* list of actual attribute bits on this inode */ -} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes - (chattr, chflags) level 0x206 */ -#endif /* POSIX */ +} __packed FILE_CHATTR_INFO; /* ext attributes (chattr, chflags) level 0x206 */ +#endif /* POSIX */ #endif /* _CIFSPDU_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 83e83d8beabb..f216fa269c85 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -124,7 +124,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec * /* resp vec */); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, - struct smb_hdr *in_buf , + struct smb_hdr *in_buf, struct smb_hdr *out_buf, int *bytes_returned); void -- cgit From 2bfd81043e944af0e52835ef6d9b41795af22341 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 11 Dec 2022 13:54:21 -0600 Subject: cifs: fix missing display of three mount options Three mount options: "tcpnodelay" and "noautotune" and "noblocksend" were not displayed when passed in on cifs/smb3 mounts (e.g. displayed in /proc/mounts e.g.). No change to defaults so these are not displayed if not specified on mount. Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 712a43161448..6094cb2ff099 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -678,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); - /* Only display max_credits if it was overridden on mount */ + /* Only display the following if overridden on mount */ if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE) seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits); + if (tcon->ses->server->tcp_nodelay) + seq_puts(s, ",tcpnodelay"); + if (tcon->ses->server->noautotune) + seq_puts(s, ",noautotune"); + if (tcon->ses->server->noblocksnd) + seq_puts(s, ",noblocksend"); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); -- cgit From 9d91f8108ebfed54284332e04d2073107df18794 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 11 Dec 2022 14:44:31 -0600 Subject: cifs: print warning when conflicting soft vs. hard mount options specified If the user specifies conflicting hard vs. soft mount options (or nosoft vs. 
nohard) print a warning to dmesg We were missing a warning when a user e.g. mounted with both "hard,soft" mount options. Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/fs_context.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 45119597c765..2c92a821e028 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -884,16 +884,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->nodfs = 1; break; case Opt_hard: - if (result.negated) + if (result.negated) { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs. soft mount options\n"); ctx->retry = 0; - else + } else ctx->retry = 1; break; case Opt_soft: if (result.negated) ctx->retry = 1; - else + else { + if (ctx->retry == 1) + cifs_dbg(VFS, "conflicting hard vs soft mount options\n"); ctx->retry = 0; + } break; case Opt_mapposix: if (result.negated) -- cgit From f7f291e14dde32a07b1f0aa06921d28f875a7b54 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 11 Dec 2022 18:18:55 -0300 Subject: cifs: fix oops during encryption When running xfstests against Azure the following oops occurred on an arm64 system Unable to handle kernel write to read-only memory at virtual address ffff0001221cf000 Mem abort info: ESR = 0x9600004f EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x0f: level 3 permission fault Data abort info: ISV = 0, ISS = 0x0000004f CM = 0, WnR = 1 swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000294f3000 [ffff0001221cf000] pgd=18000001ffff8003, p4d=18000001ffff8003, pud=18000001ff82e003, pmd=18000001ff71d003, pte=00600001221cf787 Internal error: Oops: 9600004f [#1] PREEMPT SMP ... pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--) pc : __memcpy+0x40/0x230 lr : scatterwalk_copychunks+0xe0/0x200 sp : ffff800014e92de0 x29: ffff800014e92de0 x28: ffff000114f9de80 x27: 0000000000000008 x26: 0000000000000008 x25: ffff800014e92e78 x24: 0000000000000008 x23: 0000000000000001 x22: 0000040000000000 x21: ffff000000000000 x20: 0000000000000001 x19: ffff0001037c4488 x18: 0000000000000014 x17: 235e1c0d6efa9661 x16: a435f9576b6edd6c x15: 0000000000000058 x14: 0000000000000001 x13: 0000000000000008 x12: ffff000114f2e590 x11: ffffffffffffffff x10: 0000040000000000 x9 : ffff8000105c3580 x8 : 2e9413b10000001a x7 : 534b4410fb86b005 x6 : 534b4410fb86b005 x5 : ffff0001221cf008 x4 : ffff0001037c4490 x3 : 0000000000000001 x2 : 0000000000000008 x1 : ffff0001037c4488 x0 : ffff0001221cf000 Call trace: __memcpy+0x40/0x230 scatterwalk_map_and_copy+0x98/0x100 crypto_ccm_encrypt+0x150/0x180 crypto_aead_encrypt+0x2c/0x40 crypt_message+0x750/0x880 smb3_init_transform_rq+0x298/0x340 smb_send_rqst.part.11+0xd8/0x180 smb_send_rqst+0x3c/0x100 compound_send_recv+0x534/0xbc0 smb2_query_info_compound+0x32c/0x440 smb2_set_ea+0x438/0x4c0 cifs_xattr_set+0x5d4/0x7c0 This is because in scatterwalk_copychunks(), we attempted to write to a buffer (@sign) that was allocated in the stack (vmalloc area) by crypt_message() and thus accessing its remaining 8 (x2) bytes ended up crossing a page boundary. To simply fix it, we could just pass @sign kmalloc'd from crypt_message() and then we're done. Luckily, we don't seem to pass any other vmalloc'd buffers in smb_rqst::rq_iov... Instead, let's map the correct pages and offsets from vmalloc buffers as well in cifs_sg_set_buf() and then avoiding such oopses. 
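The heart of the fix is that scatterlist entries must be built page by page whenever a buffer may live in the vmalloc area, which with CONFIG_VMAP_STACK includes on-stack buffers such as the signature buffer (@sign) above: vmalloc memory is only virtually contiguous, so a single sg entry spanning a page boundary walks into an unrelated physical page. A condensed sketch of the per-page walk, simplified from the cifs_sg_set_buf() added below (the non-vmalloc fast path is omitted):

	unsigned long addr = (unsigned long)buf & PAGE_MASK;
	unsigned int off = offset_in_page(buf);

	while (buflen) {
		unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off);

		/* one sg entry per page of the vmalloc'd buffer */
		sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off);
		off = 0;
		addr += PAGE_SIZE;
		buflen -= len;
	}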
Signed-off-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 68 +++++++++++++++++++++++++ fs/cifs/cifsproto.h | 4 +- fs/cifs/misc.c | 4 +- fs/cifs/smb2ops.c | 143 +++++++++++++++++++++++++--------------------------- 4 files changed, 140 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cd3a173e65b1..703685e2db5e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -2140,4 +2142,70 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const dst->FileNameLength = src->FileNameLength; } +static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) +{ + unsigned int len, skip; + unsigned int nents = 0; + unsigned long addr; + int i, j; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ + for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ + for (j = 0; j < rqst[i].rq_nvec; j++) { + struct kvec *iov = &rqst[i].rq_iov[j]; + + skip = (i == 0) && (j == 0) ? 20 : 0; + addr = (unsigned long)iov->iov_base + skip; + if (unlikely(is_vmalloc_addr((void *)addr))) { + len = iov->iov_len - skip; + nents += DIV_ROUND_UP(offset_in_page(addr) + len, + PAGE_SIZE); + } else { + nents++; + } + } + nents += rqst[i].rq_npages; + } + nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); + return nents; +} + +/* We can not use the normal sg_set_buf() as we will sometimes pass a + * stack object as buf. 
+ */ +static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, + const void *buf, + unsigned int buflen) +{ + unsigned long addr = (unsigned long)buf; + unsigned int off = offset_in_page(addr); + + addr &= PAGE_MASK; + if (unlikely(is_vmalloc_addr((void *)addr))) { + do { + unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); + + sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + + off = 0; + addr += PAGE_SIZE; + buflen -= len; + } while (buflen); + } else { + sg_set_page(sg++, virt_to_page(addr), buflen, off); + } + return sg; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f216fa269c85..9c6147ca029d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -600,8 +600,8 @@ int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3e68d8208cf5..1cbecd64d697 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1136,8 +1136,8 @@ cifs_free_hash(struct shash_desc **sdesc) * @len: Where to store the length for this page: * @offset: Where to store the offset for this page */ -void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset) { *len = rqst->rq_pagesz; *offset = (page == 0) ? rqst->rq_offset : 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 72b22d033ed5..6e772b31e02a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4204,69 +4204,82 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } -/* We can not use the normal sg_set_buf() as we will sometimes pass a - * stack object as buf. 
- */ -static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) +static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl, + unsigned int *num_sgs) { - void *addr; - /* - * VMAP_STACK (at least) puts stack into the vmalloc address space - */ - if (is_vmalloc_addr(buf)) - addr = vmalloc_to_page(buf); - else - addr = virt_to_page(buf); - sg_set_page(sg, addr, buflen, offset_in_page(buf)); + unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + unsigned int iv_size = crypto_aead_ivsize(tfm); + unsigned int len; + u8 *p; + + *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); + + len = iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += *num_sgs * sizeof(**sgl); + + p = kmalloc(len, GFP_ATOMIC); + if (!p) + return NULL; + + *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + return p; } -/* Assumes the first rqst has a transform header as the first iov. - * I.e. - * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted - */ -static struct scatterlist * -init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) +static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl) { - unsigned int sg_len; + unsigned int off, len, skip; struct scatterlist *sg; - unsigned int i; - unsigned int j; - unsigned int idx = 0; - int skip; - - sg_len = 1; - for (i = 0; i < num_rqst; i++) - sg_len += rqst[i].rq_nvec + rqst[i].rq_npages; + unsigned int num_sgs; + unsigned long addr; + int i, j; + void *p; - sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); + if (!p) return NULL; - sg_init_table(sg, sg_len); + sg_init_table(*sgl, num_sgs); + sg = *sgl; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ for (j = 0; j < rqst[i].rq_nvec; j++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob - */ - skip = (i == 0) && (j == 0) ? 20 : 0; - smb2_sg_set_buf(&sg[idx++], - rqst[i].rq_iov[j].iov_base + skip, - rqst[i].rq_iov[j].iov_len - skip); - } + struct kvec *iov = &rqst[i].rq_iov[j]; + skip = (i == 0) && (j == 0) ? 
20 : 0; + addr = (unsigned long)iov->iov_base + skip; + len = iov->iov_len - skip; + sg = cifs_sg_set_buf(sg, (void *)addr, len); + } for (j = 0; j < rqst[i].rq_npages; j++) { - unsigned int len, offset; - - rqst_page_get_length(&rqst[i], j, &len, &offset); - sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset); + rqst_page_get_length(&rqst[i], j, &len, &off); + sg_set_page(sg++, rqst[i].rq_pages[j], len, off); } } - smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE); - return sg; + cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + + return p; } static int @@ -4314,11 +4327,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 sign[SMB2_SIGNATURE_SIZE] = {}; u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; - char *iv; - unsigned int iv_len; + u8 *iv; DECLARE_CRYPTO_WAIT(wait); struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + void *creq; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4352,32 +4365,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - req = aead_request_alloc(tfm, GFP_KERNEL); - if (!req) { - cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); + if (unlikely(!creq)) return -ENOMEM; - } if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); crypt_len += SMB2_SIGNATURE_SIZE; } - sg = init_sg(num_rqst, rqst, sign); - if (!sg) { - cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__); - rc = -ENOMEM; - goto free_req; - } - - iv_len = crypto_aead_ivsize(tfm); - iv = kzalloc(iv_len, GFP_KERNEL); - if (!iv) { - cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__); - rc = -ENOMEM; - goto free_sg; - } - if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); @@ -4386,6 +4382,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } + aead_request_set_tfm(req, tfm); aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); @@ -4398,11 +4395,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree_sensitive(iv); -free_sg: - kfree_sensitive(sg); -free_req: - kfree_sensitive(req); + kfree_sensitive(creq); return rc; } -- cgit From a9438b44bc7015b18931e312bbd249a25bb59a65 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Dec 2022 12:36:33 +0100 Subject: writeback: Add asserts for adding freed inode to lists In the past we had several use-after-free issues with inodes getting added to writeback lists after evict() removed them. These are painful to debug so add some asserts to catch the problem earlier. The only non-obvious change in the commit is that we need to tweak redirty_tail_locked() to avoid triggering assertion in inode_io_list_move_locked(). 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221212113633.29181-1-jack@suse.cz Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 443f83382b9b..6cd172c4cb3e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); list_move(&inode->i_io_list, head); @@ -280,6 +281,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; if (wb != &wb->bdi->wb) @@ -1129,6 +1131,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); + WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); @@ -1294,6 +1297,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC_QUEUED; + /* + * When the inode is being freed just don't bother with dirty list + * tracking. Flush worker will ignore this inode anyway and it will + * trigger assertions in inode_io_list_move_locked(). + */ + if (inode->i_state & I_FREEING) { + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + return; + } if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -1302,7 +1316,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); - inode->i_state &= ~I_SYNC_QUEUED; } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) -- cgit From 23e188a16423a6e65290abf39dd427ff047e6843 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 10 Dec 2022 18:10:42 +0800 Subject: writeback: remove obsolete macro EXPIRE_DIRTY_ATIME EXPIRE_DIRTY_ATIME is not used anymore. Remove it. Signed-off-by: Miaohe Lin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221210101042.2012931-1-linmiaohe@huawei.com Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6cd172c4cb3e..6c113585e782 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1358,8 +1358,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } -#define EXPIRE_DIRTY_ATIME 0x0001 - /* * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. -- cgit From e480751970e84bc13ab5c288dbbe16b0638cc088 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 11:37:08 +0800 Subject: f2fs: remove F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() macro F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() have never been used since they were introduced by this commit 76f105a2dbcd("f2fs: add feature facility in superblock"). So let's remove them. BTW, convert f2fs_sb_has_##name to return bool. 
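To make the bool conversion concrete, an existing instantiation such as F2FS_FEATURE_FUNCS(readonly, RO) now expands to roughly:

	static inline bool f2fs_sb_has_readonly(struct f2fs_sb_info *sbi)
	{
		return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_RO);
	}

so a caller like "if (f2fs_sb_has_readonly(sbi))" tests a genuine bool rather than an int carrying a 0/1 value, and the write-side macros disappear with no users to update.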
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 296683648d4f..cf738f1275b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -203,10 +203,6 @@ struct f2fs_mount_info { #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) -#define F2FS_SET_FEATURE(sbi, mask) \ (sbi->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sbi, mask) \ (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -4387,7 +4383,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } -- cgit From ed8ac22b6b75804743f1dae6563d75f85cfd1483 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 10:48:42 +0800 Subject: f2fs: introduce f2fs_is_readonly() for readability Introduce f2fs_is_readonly() and use it to simplify code. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cf738f1275b2..eb8c27c4e5fc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4575,6 +4575,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, } } +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5f6f632cf7c..79bf1faf4161 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1351,8 +1351,7 @@ default_check: return -EINVAL; } - if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && - test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -2083,7 +2082,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); -- cgit From 12607c1ba7637e750402f555b6695c50fce77a2b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:36:43 -0800 Subject: f2fs: specify extent cache for read explicitly Let's describe it as the read extent cache.
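The rename is mechanical and user-visible behavior (including the extent_cache/noextent_cache mount options) is unchanged; the mapping, taken from the diff below, is roughly:

	F2FS_MOUNT_EXTENT_CACHE      -> F2FS_MOUNT_READ_EXTENT_CACHE
	EXTENT_CACHE (enum mem_type) -> READ_EXTENT_CACHE
	EXTENT_CACHE_SHRINK_NUMBER   -> READ_EXTENT_CACHE_SHRINK_NUMBER
	get_extent_info()            -> get_read_extent_info()
	set_raw_extent()             -> set_raw_read_extent()

so, for example, test_opt(sbi, EXTENT_CACHE) becomes test_opt(sbi, READ_EXTENT_CACHE) at every call site.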
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 4 ++-- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/node.h | 2 +- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 12 ++++++------ 7 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 932c070173b9..8cd87aee0292 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -383,7 +383,7 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (!i_ext || !i_ext->len) return; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) @@ -710,7 +710,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) + if (!test_opt(sbi, READ_EXTENT_CACHE)) return 0; if (!atomic_read(&sbi->total_zombie_tree)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eb8c27c4e5fc..1c39f8145b61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -92,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -600,7 +600,7 @@ enum { #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS #define RECOVERY_MIN_RA_BLOCKS 1 @@ -830,7 +830,7 @@ struct f2fs_inode_info { loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -838,7 +838,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -4407,7 +4407,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (!test_opt(sbi, EXTENT_CACHE) || + if (!test_opt(sbi, READ_EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT) || (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && !f2fs_sb_has_readonly(sbi))) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 577f109b4e1d..2c705c60019b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -629,7 +629,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b9ee5a1176a0..84b147966080 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -85,7 +85,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if 
(type == EXTENT_CACHE) { + } else if (type == READ_EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3c09cae058b0..0aa48704c77a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -146,7 +146,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ - EXTENT_CACHE, /* indicates extent cache */ + READ_EXTENT_CACHE, /* indicates read extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9486ca49ecb1..51de358bc452 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -449,8 +449,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79bf1faf4161..412c2e7352c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -814,10 +814,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); @@ -1954,7 +1954,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); @@ -2076,7 +2076,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); @@ -2218,7 +2218,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2308,7 +2308,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; -- cgit From 3bac20a8f011b8ed4012b43f4f33010432b3c647 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 
09:44:58 -0800 Subject: f2fs: move internal functions into extent_cache.c No functional change. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 88 ++++++++++++++++++++++++++++++++++++++++++++------ fs/f2fs/f2fs.h | 69 ++------------------------------------- 2 files changed, 81 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 8cd87aee0292..2a8e31e6d518 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -15,6 +15,77 @@ #include "node.h" #include +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; + + if (keep_clen) + return; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif +} + +static bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + + if (!test_opt(sbi, READ_EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + return S_ISREG(inode->i_mode); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -591,16 +662,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true); next_en = en; } parts++; @@ -632,8 +703,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, /* 3. 
update extent in extent cache */ if (blkaddr) { - - set_extent_info(&ei, fofs, blkaddr, len); + __set_extent_info(&ei, fofs, len, blkaddr, false); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -692,7 +762,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1c39f8145b61..04fdf010bb77 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,7 +621,7 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + block_t blk; /* start block address of the extent */ #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned int c_len; /* physical extent length of compressed blocks */ #endif @@ -846,17 +846,6 @@ static inline void set_raw_read_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -876,41 +865,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -2581,6 +2535,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -4403,26 +4358,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. 
- */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) -- cgit From 749d543c0d451fff31e8f7a3e0a031ffcbf1ebb1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 10:01:18 -0800 Subject: f2fs: remove unnecessary __init_extent_tree Added into the caller. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2a8e31e6d518..c6810347e205 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -386,21 +386,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et) { @@ -460,8 +445,12 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt)) goto out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { + et->largest = en->ei; + et->cached_en = en; + spin_lock(&sbi->extent_lock); list_add_tail(&en->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); -- cgit From e7547daccd6a37522f0af74ec4b5a3036f3dd328 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:26:29 -0800 Subject: f2fs: refactor extent_cache to support for read and more This patch prepares extent_cache to be ready for addition. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +-- fs/f2fs/debug.c | 65 ++++--- fs/f2fs/extent_cache.c | 463 +++++++++++++++++++++++++++++-------------------- fs/f2fs/f2fs.h | 119 ++++++++----- fs/f2fs/file.c | 8 +- fs/f2fs/gc.c | 4 +- fs/f2fs/inode.c | 6 +- fs/f2fs/node.c | 8 +- fs/f2fs/segment.c | 3 +- fs/f2fs/shrinker.c | 19 +- 10 files changed, 434 insertions(+), 281 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 35c19248b1e2..75abd450730b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1126,7 +1126,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1195,7 +1195,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1217,7 +1217,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { @@ -1485,7 +1485,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1695,7 +1695,7 @@ skip: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1740,7 +1740,7 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -2201,7 +2201,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2635,7 +2635,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, @@ -3359,7 +3359,7 @@ restart: } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3400,7 +3400,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if 
(f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a216dcdf6941..a9baa121d829 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -72,15 +72,23 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -294,10 +302,16 @@ get_cache: sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); - si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem = 0; if (sbi->node_inode) { @@ -490,16 +504,18 @@ static int stat_show(struct seq_file *s, void *v) si->bg_node_blks); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_READ] ? 
0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -566,8 +582,10 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -600,10 +618,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6810347e205..654a14ab8977 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -17,21 +17,37 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, - block_t blk, bool keep_clen) + block_t blk, bool keep_clen, + enum extent_type type) { ei->fofs = fofs; - ei->blk = blk; ei->len = len; - if (keep_clen) - return; - + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; #ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; + ei->c_len = 0; #endif + } +} + +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); } -static bool f2fs_may_extent_tree(struct inode *inode) +static bool __may_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -42,18 +58,16 @@ static bool f2fs_may_extent_tree(struct inode *inode) if (list_empty(&sbi->s_list)) return false; - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - return S_ISREG(inode->i_mode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; } static void __try_update_largest_extent(struct extent_tree *et, struct extent_node *en) { + if (et->type != EX_READ) + return; if (en->ei.len <= et->largest.len) return; @@ -62,28 +76,31 @@ static void __try_update_largest_extent(struct extent_tree *et, } static bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { + if (type == EX_READ) { #ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && 
front->len != front->c_len) - return false; + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; #endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } + return false; } static bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) + struct extent_info *back, enum extent_type type) { - return __is_extent_mergeable(back, cur); + return __is_extent_mergeable(back, cur, type); } static bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { - return __is_extent_mergeable(cur, front); + return __is_extent_mergeable(cur, front, type); } static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, @@ -308,6 +325,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -321,16 +339,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -346,42 +366,47 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + 
mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } @@ -415,35 +440,38 @@ static void __drop_largest_extent(struct extent_tree *et, } /* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ - if (i_ext && i_ext->len) { + if (!__may_extent_tree(inode, type)) { + /* drop largest read extent */ + if (type == EX_READ && i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, type); if (!i_ext || !i_ext->len) - return; + goto out; + + BUG_ON(type != EX_READ); get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); @@ -451,37 +479,40 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) et->largest = en->ei; et->cached_en = en; - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); +out: + if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); } void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { - __f2fs_init_extent_tree(inode, ipage); - - if (!F2FS_I(inode)->extent_tree) - set_inode_flag(inode, FI_NO_EXTENT); + /* initialize read cache */ + __f2fs_init_extent_tree(inode, ipage, EX_READ); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; f2fs_bug_on(sbi, !et); - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -495,23 +526,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + 
spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); return ret; } @@ -520,18 +552,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -543,12 +577,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -558,6 +592,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -580,47 +615,50 @@ do_insert: __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); - + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; - } + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } - prev = et->largest; - dei.len = 0; + prev = et->largest; + dei.len = 0; - /* - * drop largest extent before lookup, in case it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); + /* + * drop largest extent before lookup, in case it's 
already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + } /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, @@ -641,26 +679,30 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { __set_extent_info(&ei, end, org_end - end, - end - dei.fofs + dei.blk, false); + end - dei.fofs + dei.blk, false, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), - en->ei.blk + (end - dei.fofs), true); + en->ei.blk + (end - dei.fofs), true, + type); next_en = en; } parts++; @@ -690,9 +732,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { - __set_extent_info(&ei, fofs, len, blkaddr, false); + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -722,19 +766,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -751,7 +796,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true); + __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -762,24 +807,43 @@ unlock_out: } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } + __update_extent_tree_range(dn->inode, &ei, type); 
+} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, READ_EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -787,61 +851,100 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if 
(!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -854,31 +957,44 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode *inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; write_lock(&et->lock); - set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -886,76 +1002,49 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; + F2FS_I(inode)->extent_tree[type] = NULL; - trace_f2fs_destroy_extent_tree(inode, node_cnt); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (!f2fs_may_extent_tree(inode)) - return false; - - return f2fs_lookup_extent_tree(inode, pgofs, ei); -} - -void f2fs_update_extent_cache(struct dnode_of_data *dn) +void f2fs_destroy_extent_tree(struct inode *inode) { - pgoff_t fofs; - block_t blkaddr; - - if 
(!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); + __destroy_extent_tree(inode, EX_READ); } -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - +static void __init_extent_tree_info(struct extent_tree_info *eti) { - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04fdf010bb77..7c68bedee649 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -596,16 +596,22 @@ enum { /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 -#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS -#define RECOVERY_MIN_RA_BLOCKS 1 - -#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ +/* extent cache type */ +enum extent_type { + EX_READ, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ @@ -621,10 +627,17 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - block_t blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; + }; }; struct extent_node { @@ -636,13 +649,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root 
extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -805,7 +830,8 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct task_struct *atomic_write_task; /* store atomic write task */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ @@ -1626,14 +1652,7 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1718,10 +1737,14 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -3823,9 +3846,17 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, 
ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -3884,10 +3915,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -4010,10 +4041,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -4119,20 +4150,23 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4202,9 +4236,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, 
block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); @@ -4285,9 +4319,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab0a0d3730f6..cbe7c24065c7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -618,7 +618,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -1496,7 +1496,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -2558,7 +2558,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2590,7 +2590,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. 
*/ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d19e26b2e875..f0c6506d8975 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1146,7 +1146,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1164,7 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2c705c60019b..086f201f15a0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree) { - struct extent_info *ei = &fi->extent_tree->largest; + if (fi->extent_tree[EX_READ]) { + struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -607,7 +607,7 @@ retry: void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 84b147966080..07419c3e42a5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -86,9 +86,11 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == READ_EXTENT_CACHE) { - mem_size = (atomic_read(&sbi->total_ext_tree) * + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + + mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + - atomic_read(&sbi->total_ext_node) * + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == DISCARD_CACHE) { @@ -859,7 +861,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + 1); - f2fs_update_extent_tree_range_compressed(dn->inode, + f2fs_update_read_extent_tree_range_compressed(dn->inode, index, blkaddr, F2FS_I(dn->inode)->i_cluster_size, c_len); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 51de358bc452..8722d1a13c17 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -450,7 +450,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index dd3c3c7a90ec..33c490e69ae3 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ 
-28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -99,8 +102,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; - /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +133,7 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); -- cgit From 72840cccc0a1a0a0dc1bb27b669a9111be6d0f6a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2022 13:51:09 -0800 Subject: f2fs: allocate the extent_cache by default Let's allocate it to remove the runtime complexity. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 38 +++++++++++++++++++------------------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/inode.c | 6 ++++-- fs/f2fs/namei.c | 4 ++-- 4 files changed, 27 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 654a14ab8977..305f969e3ad1 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -47,20 +47,23 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } -static bool __may_extent_tree(struct inode *inode, enum extent_type type) +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; +} +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ /* * for recovered files during mount do not create extents * if shrinker is not registered. */ - if (list_empty(&sbi->s_list)) + if (list_empty(&F2FS_I_SB(inode)->s_list)) return false; - if (type == EX_READ) - return __may_read_extent_tree(inode); - return false; + return __init_may_extent_tree(inode, type); } static void __try_update_largest_extent(struct extent_tree *et, @@ -439,20 +442,18 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, - enum extent_type type) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree_info *eti = &sbi->extent_tree[type]; - struct f2fs_extent *i_ext = ipage ? 
&F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!__may_extent_tree(inode, type)) { + if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ - if (type == EX_READ && i_ext && i_ext->len) { + if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); @@ -460,13 +461,11 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, goto out; } - et = __grab_extent_tree(inode, type); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) goto out; - BUG_ON(type != EX_READ); - get_read_extent_info(&ei, i_ext); write_lock(&et->lock); @@ -486,14 +485,15 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, unlock_out: write_unlock(&et->lock); out: - if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + if (!F2FS_I(inode)->extent_tree[EX_READ]) set_inode_flag(inode, FI_NO_EXTENT); } -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ - __f2fs_init_extent_tree(inode, ipage, EX_READ); + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7c68bedee649..ec52e06f8e61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4150,7 +4150,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); @@ -4159,6 +4159,7 @@ int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 086f201f15a0..c845c16f97d0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -392,8 +392,6 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, node_page); - get_inline_info(inode, ri); fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
@@ -479,6 +477,10 @@ static int do_read_inode(struct inode *inode) } init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 58a91ce8fe08..46de782c2baa 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -284,8 +284,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, } F2FS_I(inode)->i_inline_xattr_size = xattr_size; - f2fs_init_extent_tree(inode, NULL); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -311,6 +309,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_set_inode_flags(inode); + f2fs_init_extent_tree(inode); + trace_f2fs_new_inode(inode, 0); return inode; -- cgit From 71644dff481180ba024ac4f5cb1f068756357adf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2022 17:37:15 -0800 Subject: f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy of data temperature classification and reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation records the data block update frequency as the "age" of each extent per inode, and makes use of the age info to indicate a better temperature type for data block allocation: - It records the total data blocks allocated since mount; - When a file extent has been updated, it calculates the count of data blocks allocated since the last update as the age of the extent; - Before a data block is allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of the test device is large enough (128G) so that it will not switch to SSR mode during the test.
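To make the aging rule above concrete, here is a minimal userspace sketch. It is an illustration only, not the committed kernel code: the helper name blend_block_age and the plain weighted average built on LAST_AGE_WEIGHT (30 in this patch) are assumptions; the authoritative arithmetic is __calculate_block_age() in the diff below.

#include <stdio.h>

#define LAST_AGE_WEIGHT 30ULL	/* weight kept for the previously recorded age */

/* Hypothetical helper: blend a fresh age sample with the stored one. */
static unsigned long long blend_block_age(unsigned long long new_age,
					  unsigned long long old_age)
{
	return (old_age * LAST_AGE_WEIGHT +
		new_age * (100ULL - LAST_AGE_WEIGHT)) / 100ULL;
}

int main(void)
{
	/* assumed sample values, mirroring the counters the patch introduces */
	unsigned long long allocated_since_mount = 1000000;	/* per-sb allocation counter */
	unsigned long long last_blocks = 900000;	/* snapshot taken at this extent's last update */
	unsigned long long old_age = 50000;	/* age currently recorded in the extent */

	/* blocks allocated since this extent was last touched */
	unsigned long long cur_age = allocated_since_mount - last_blocks;

	printf("blended age = %llu\n", blend_block_age(cur_age, old_age));
	return 0;
}

A small blended age means the extent keeps being rewritten (hot data); a large one means it has survived many allocations untouched (cold data).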
Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 Signed-off-by: xiongping1 Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 21 ++++++ fs/f2fs/extent_cache.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/f2fs/f2fs.h | 38 ++++++++++ fs/f2fs/file.c | 1 + fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 10 +-- fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 33 +++++++++ fs/f2fs/shrinker.c | 10 ++- fs/f2fs/super.c | 14 ++++ fs/f2fs/sysfs.c | 24 +++++++ 11 files changed, 329 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a9baa121d829..8f1ef742551f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -88,6 +88,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_largest = atomic64_read(&sbi->read_hit_largest); si->hit_total[EX_READ] += si->hit_largest; + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); @@ -516,6 +519,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree[EX_READ], si->zombie_tree[EX_READ], si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -586,6 +605,8 @@ static int stat_show(struct seq_file *s, void *v) si->cache_mem >> 10); seq_printf(s, " - read extent cache: %llu KB\n", si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 305f969e3ad1..1bd38a78ebba 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -6,6 +6,10 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim * Chao Yu + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. 
+ * http://www.xiaomi.com/ */ #include @@ -18,6 +22,7 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, enum extent_type type) { ei->fofs = fofs; @@ -30,6 +35,9 @@ static void __set_extent_info(struct extent_info *ei, #ifdef CONFIG_F2FS_FS_COMPRESSION ei->c_len = 0; #endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; } } @@ -47,10 +55,27 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static bool __may_age_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return false; + /* don't cache block age info for cold file */ + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { if (type == EX_READ) return __may_read_extent_tree(inode); + else if (type == EX_BLOCK_AGE) + return __may_age_extent_tree(inode); return false; } @@ -90,6 +115,11 @@ static bool __is_extent_mergeable(struct extent_info *back, #endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); } return false; } @@ -489,11 +519,22 @@ out: set_inode_flag(inode, FI_NO_EXTENT); } +void f2fs_init_age_extent_tree(struct inode *inode) +{ + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ if (__init_may_extent_tree(inode, EX_READ)) __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -544,6 +585,8 @@ out: if (type == EX_READ) trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); return ret; } @@ -642,6 +685,10 @@ static void __update_extent_tree_range(struct inode *inode, if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); + else if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); + write_lock(&et->lock); if (type == EX_READ) { @@ -694,6 +741,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&ei, end, org_end - end, end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); @@ -702,6 +750,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, type); next_en = en; } @@ -732,11 +781,15 @@ static void __update_extent_tree_range(struct inode *inode, en = next_en; } + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; + /* 3. 
update extent in read extent cache */ BUG_ON(type != EX_READ); if (tei->blk) { - __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -758,7 +811,17 @@ static void __update_extent_tree_range(struct inode *inode, et->largest_updated = false; updated = true; } + goto out_read_extent_cache; +update_age_extent_cache: + if (!tei->last_blocks) + goto out_read_extent_cache; + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: write_unlock(&et->lock); if (updated) @@ -796,7 +859,7 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -807,6 +870,72 @@ unlock_out: } #endif +static unsigned long long __calculate_block_age(unsigned long long new, + unsigned long long old) +{ + unsigned long long diff; + + diff = (new >= old) ? new - (new - old) : new + (old - new); + + return div_u64(diff * LAST_AGE_WEIGHT, 100); +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. 
+ */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + ei->blk == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= ei->last_blocks) + cur_age = cur_blocks - ei->last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + + if (ei->age) + ei->age = __calculate_block_age(cur_age, ei->age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (ei->blk == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(ei->blk) && + !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + f2fs_bug_on(sbi, 1); + return -EINVAL; + } +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { struct extent_info ei; @@ -823,6 +952,10 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ ei.blk = NULL_ADDR; else ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + ei.blk = dn->data_blkaddr; + if (__get_new_block_age(dn->inode, &ei)) + return; } __update_extent_tree_range(dn->inode, &ei, type); } @@ -940,6 +1073,43 @@ unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrin return __shrink_extent_tree(sbi, nr_shrink, EX_READ); } +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + static unsigned int __destroy_extent_node(struct inode *inode, enum extent_type type) { @@ -960,6 +1130,7 @@ static unsigned int __destroy_extent_node(struct inode *inode, void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); } static void __drop_extent_tree(struct inode *inode, enum extent_type type) @@ -988,6 +1159,7 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) void f2fs_drop_extent_tree(struct inode *inode) { __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); } static void __destroy_extent_tree(struct inode *inode, enum extent_type type) @@ -1028,6 +1200,7 @@ static void __destroy_extent_tree(struct inode *inode, enum extent_type type) void f2fs_destroy_extent_tree(struct inode *inode) { __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); } static void __init_extent_tree_info(struct 
extent_tree_info *eti) @@ -1045,6 +1218,12 @@ static void __init_extent_tree_info(struct extent_tree_info *eti) void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec52e06f8e61..e8953c3dc81a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -107,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -607,9 +608,22 @@ enum { /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 + +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + /* extent cache type */ enum extent_type { EX_READ, + EX_BLOCK_AGE, NR_EXTENT_CACHES, }; @@ -637,6 +651,13 @@ struct extent_info { unsigned int c_len; #endif }; + /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; }; }; @@ -1653,6 +1674,11 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + + /* The threshold used for hot and warm data seperation*/ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -3857,6 +3883,8 @@ struct f2fs_stat_info { unsigned long long ext_mem[NR_EXTENT_CACHES]; /* for read extent cache */ unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -4168,6 +4196,16 @@ void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cbe7c24065c7..56c23b5e9d65 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -619,6 +619,7 @@ void f2fs_truncate_data_blocks_range(struct 
dnode_of_data *dn, int count) fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + f2fs_update_age_extent_cache_range(dn, fofs, nr_free); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c845c16f97d0..ff6cf66ed46b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -480,6 +480,7 @@ static int do_read_inode(struct inode *inode) /* Need all the flag bits */ f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 07419c3e42a5..dde4c0458704 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -60,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) avail_ram = val.totalram - val.totalhigh; /* - * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively */ if (type == FREE_NIDS) { mem_size = (nm_i->nid_cnt[FREE_NID] * @@ -85,14 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == READ_EXTENT_CACHE) { - struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) { + enum extent_type etype = type == READ_EXTENT_CACHE ? + EX_READ : EX_BLOCK_AGE; + struct extent_tree_info *eti = &sbi->extent_tree[etype]; mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; - res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0aa48704c77a..99454d46a939 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ READ_EXTENT_CACHE, /* indicates read extent cache */ + AGE_EXTENT_CACHE, /* indicates age extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8722d1a13c17..dee712f7225f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -453,6 +453,11 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); + /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); @@ -3151,10 +3156,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) } } +static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei; + + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= 
sbi->hot_data_age_threshold) + return CURSEG_HOT_DATA; + if (ei.age <= sbi->warm_data_age_threshold) + return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; @@ -3169,6 +3192,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) @@ -3287,6 +3315,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { @@ -3414,6 +3445,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 33c490e69ae3..83d6fb97dcae 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -59,6 +59,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count read extent cache entries */ count += __count_extent_cache(sbi, EX_READ); + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); + /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -102,8 +105,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; + /* shrink extent cache entries */ + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + /* shrink read extent cache entries */ - freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); /* shrink clean nat cache entries */ if (freed < nr) @@ -134,6 +140,8 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 412c2e7352c0..180d8b804d13 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -163,6 +163,7 @@ enum { Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, + Opt_age_extent_cache, Opt_err, }; @@ -241,6 +242,7 @@ static match_table_t f2fs_tokens = { {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, + {Opt_age_extent_cache, "age_extent_cache"}, {Opt_err, NULL}, }; @@ -1257,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_age_extent_cache: + set_opt(sbi, AGE_EXTENT_CACHE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1958,6 +1963,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) 
seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); @@ -2219,6 +2226,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2313,6 +2321,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a4745d596310..2ab215110596 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -668,6 +668,24 @@ out: return count; } + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t == 0 || t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -923,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); +/* For block age extent cache */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -1018,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(peak_atomic_write), ATTR_LIST(committed_atomic_block), ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); -- cgit From db8dcd25ec84120d4e57a7f17a566825cec17ae8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Dec 2022 13:42:17 +0000 Subject: f2fs: Fix spelling mistake in label: free_bio_enrty_cache -> free_bio_entry_cache There is a spelling mistake in a label name. Fix it. 
Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 180d8b804d13..c02a717cf880 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4723,7 +4723,7 @@ static int __init init_f2fs_fs(void) goto free_iostat; err = f2fs_init_bioset(); if (err) - goto free_bio_enrty_cache; + goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; @@ -4740,7 +4740,7 @@ free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); -free_bio_enrty_cache: +free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); -- cgit From 15e38ee44d50cad264da80ef75626b9224ddc4a3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 5 Dec 2022 22:56:03 +0800 Subject: f2fs: fix iostat parameter for discard Like the other data we count, iostat uses the number of bytes as its basic unit, but discard has been counted in number of commands. Since each discard command carries its number of blocks, let's switch to the number of bytes as the base unit. Fixes: b0af6d491a6b ("f2fs: add app/fs io stat") Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dee712f7225f..f1845a032885 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1187,7 +1187,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); lstart += len; start += len; -- cgit From 25547439f1dcc3def6062bd3e69165cd806a594e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 2 Dec 2022 12:58:41 +0800 Subject: f2fs: don't call f2fs_issue_discard_timeout() when discard_cmd_cnt is 0 in f2fs_put_super() There is no need to call f2fs_issue_discard_timeout() in f2fs_put_super() when no discard command is pending. Since callers of f2fs_issue_discard_timeout() usually check the number of discard commands before using it, let's move that check into f2fs_issue_discard_timeout() itself. While at it, use f2fs_realtime_discard_enable() to simplify the code. Reported-by: kernel test robot Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- fs/f2fs/super.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1845a032885..a9099a754dd2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1661,6 +1661,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; + if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@ -2116,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them.
*/ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c02a717cf880..1f812b9ce985 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1581,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_issue_discard_timeout(sbi); - if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && - !sbi->discard_blks && !dropped) { + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -2233,7 +2232,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); - struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2420,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_flush; need_stop_discard = true; } else { - dcc = SM_I(sbi)->dcc_info; f2fs_stop_discard_thread(sbi); - if (atomic_read(&dcc->discard_cmd_cnt)) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } -- cgit From 7411143f2021530d7641fbb40daaada4ee63f7e6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 29 Nov 2022 12:15:23 +0800 Subject: f2fs: fix some format WARNING in debug.c and sysfs.c To fix: WARNING: function definition argument 'struct f2fs_attr *' should also have an identifier name + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); WARNING: return sysfs_emit(...) 
formats should include a terminating newline + return sysfs_emit(buf, "(none)"); WARNING: Prefer 'unsigned int' to bare use of 'unsigned' + unsigned npages = NODE_MAPPING(sbi)->nrpages; WARNING: Missing a blank line after declarations + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; WARNING: quoted string split across lines + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 45 +++++++++++++++++++++++---------------------- fs/f2fs/sysfs.c | 10 +++++----- 2 files changed, 28 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8f1ef742551f..32af4f0c5735 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -318,18 +318,19 @@ get_cache: si->page_mem = 0; if (sbi->node_inode) { - unsigned npages = NODE_MAPPING(sbi)->nrpages; + unsigned long npages = NODE_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { - unsigned npages = META_MAPPING(sbi)->nrpages; + unsigned long npages = META_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (sbi->compress_inode) { - unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #endif @@ -477,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); - seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " - "Cur time: %4d(ms), Peak time: %4d(ms))\n", - si->nr_queued_ckpt, si->nr_issued_ckpt, - si->nr_total_ckpt, si->cur_ckpt_time, - si->peak_ckpt_time); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); - seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " - "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Mid (%d), " - "Urgent Low (%d)\n", - si->sbi->gc_reclaimed_segs[GC_NORMAL], - si->sbi->gc_reclaimed_segs[GC_IDLE_CB], - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], - si->sbi->gc_reclaimed_segs[GC_IDLE_AT], - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], - si->sbi->gc_reclaimed_segs[GC_URGENT_MID], - si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); 
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -540,11 +541,11 @@ static int stat_show(struct seq_file *s, void *v) si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); - seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " - "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->flush_list_empty, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2ab215110596..83a366f3ee80 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = { struct f2fs_attr { struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); int struct_type; int offset; int id; @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sysfs_emit(buf, "(none)"); + return sysfs_emit(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS -- cgit From 26a8057a1ada97b528b93fdf3ac4fd03170f1900 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Sun, 11 Dec 2022 21:08:41 +0800 Subject: f2fs: reset wait_ms to default if any of the victims have been selected In non-foreground gc mode, if no victim is selected, the gc process will wait for no_gc_sleep_time before waking up again. In this subsequent time, even though a victim will be selected, the gc process still waits for no_gc_sleep_time before waking up. The configuration of wait_ms is not reasonable. After any of the victims have been selected, we need to reset wait_ms to default sleep time from no_gc_sleep_time. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f0c6506d8975..d7a9d84ba57c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,6 +141,10 @@ do_gc: /* don't bother wait_ms by foreground gc */ if (!foreground) wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; } if (foreground) -- cgit From 40306b4d1ba25970dafd53432e8daa5d591ebd99 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Tue, 13 Dec 2022 09:40:32 +0800 Subject: exfat: fix overflow in sector and cluster conversion According to the exFAT specification, there are at most 2^32-11 clusters in a volume. so using 'int' is not enough for cluster index, the return value type of exfat_sector_to_cluster() should be 'unsigned int'. 
Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index a1e7feb22079..bc6d21d7c5ad 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -400,7 +400,7 @@ static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi, sbi->data_start_sector; } -static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi, +static inline unsigned int exfat_sector_to_cluster(struct exfat_sb_info *sbi, sector_t sec) { return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) + -- cgit From 36955d368dc101be885ad2c71618e3c3a93cd8ee Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 17 Nov 2022 11:31:30 +0800 Subject: exfat: reuse exfat_find_location() to simplify exfat_get_dentry_set() In exfat_get_dentry_set(), part of the code is the same as exfat_find_location(), reuse exfat_find_location() to simplify exfat_get_dentry_set(). Code refinement, no functional changes. Signed-off-by: Yuezhang Mo Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 8121a7e073bc..1dfa67f307f1 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -818,7 +818,7 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, unsigned int type) { int ret, i, num_bh; - unsigned int off, byte_offset, clu = 0; + unsigned int off; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_dentry *ep; @@ -831,27 +831,16 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, return -EIO; } - byte_offset = EXFAT_DEN_TO_B(entry); - ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu); + ret = exfat_find_location(sb, p_dir, entry, &sec, &off); if (ret) return ret; memset(es, 0, sizeof(*es)); es->sb = sb; es->modified = false; - - /* byte offset in cluster */ - byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi); - - /* byte offset in sector */ - off = EXFAT_BLK_OFFSET(byte_offset, sb); es->start_off = off; es->bh = es->__bh; - /* sector offset in cluster */ - sec = EXFAT_B_TO_BLK(byte_offset, sb); - sec += exfat_cluster_to_sector(sbi, clu); - bh = sb_bread(sb, sec); if (!bh) return -EIO; @@ -878,6 +867,8 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es, for (i = 1; i < num_bh; i++) { /* get the next sector */ if (exfat_is_last_sector_in_cluster(sbi, sec)) { + unsigned int clu = exfat_sector_to_cluster(sbi, sec); + if (p_dir->flags == ALLOC_NO_FAT_CHAIN) clu++; else if (exfat_get_next_cluster(sb, &clu)) -- cgit From d74f4a3f6d88a2416564bc6bf937e423a4ae8f8e Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 14 Dec 2022 10:39:11 +0800 Subject: cifs: Remove duplicated include in cifsglob.h ./fs/cifs/cifsglob.h: linux/scatterlist.h is included more than once. 
Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3459 Fixes: f7f291e14dde ("cifs: fix oops during encryption") Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 703685e2db5e..82f2d3070c26 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -23,7 +23,6 @@ #include "cifs_fs_sb.h" #include "cifsacl.h" #include <crypto/internal/hash.h> -#include <linux/scatterlist.h> #include <uapi/linux/cifs/cifs_mount.h> #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" -- cgit